Fix wrong results for queries of type NOT (select <>)

For queries of the form "NOT (subselect)", planner lost the "NOT" operator during the initial pull-up in pull_up_sublinks_qual_recurse() which resulted in incorrect filter and hence wrong results. Inside pull_up_sublinks_qual_recurse(), when a qual contains NOT, we check if sublink type is any one of these: EXISTS, ANY or ALL, and invoke appropriate sublink pull up routines. In case of qual of the form "NOT (SELECT <>)" the sublink type is EXPR; hence we recurse into the argument of NOT; at which point we lose the information about NOT operator. This commit fixes the issue by returning the node unmodified when the argument of NOT is an EXPR sublink. The EXPR sublink, later gets pulled up by preprocess_qual_conditions() wherein, pull_up_sublinks() is invoked again to handle sublinks in an expression. Signed-off-by: N Sambitesh Dash <sdash@pivotal.io>

Fix wrong results for queries of type NOT (select <>)
For queries of the form "NOT (subselect)", planner lost the "NOT" operator during the initial pull-up in pull_up_sublinks_qual_recurse() which resulted in incorrect filter and hence wrong results. Inside pull_up_sublinks_qual_recurse(), when a qual contains NOT, we check if sublink type is any one of these: EXISTS, ANY or ALL, and invoke appropriate sublink pull up routines. In case of qual of the form "NOT (SELECT <>)" the sublink type is EXPR; hence we recurse into the argument of NOT; at which point we lose the information about NOT operator. This commit fixes the issue by returning the node unmodified when the argument of NOT is an EXPR sublink. The EXPR sublink, later gets pulled up by preprocess_qual_conditions() wherein, pull_up_sublinks() is invoked again to handle sublinks in an expression. Signed-off-by: N Sambitesh Dash <sdash@pivotal.io>
a0a38df9 · Dhanashree Kashid · 30271024 · a0a38df9 · a0a38df9 · a0a38df9
7 changed file
--- a/src/backend/optimizer/prep/prepjointree.c
+++ b/src/backend/optimizer/prep/prepjointree.c
@@ -396,13 +396,13 @@ pull_up_sublinks_qual_recurse(PlannerInfo *root, Node *node,
 	if (not_clause(node))
 	{
 		/* If the immediate argument of NOT is EXISTS, try to convert */
-		SubLink    *sublink = (SubLink *) get_notclausearg((Expr *) node);
 		Node	   *arg = (Node *) get_notclausearg((Expr *) node);
 		JoinExpr   *j;
 		Relids		child_rels;

-		if (sublink && IsA(sublink, SubLink))
+		if (arg && IsA(arg, SubLink))
 		{
+			SubLink    *sublink = (SubLink *) arg;
 			if (sublink->subLinkType == EXISTS_SUBLINK)
 			{
 				Node* subst;
@@ -440,17 +440,18 @@ pull_up_sublinks_qual_recurse(PlannerInfo *root, Node *node,
 			 *		 NOT val op ANY (subq)	 =>  val op' ALL (subq)
 			 *		 NOT val op ALL (subq)	 =>  val op' ANY (subq)
 			 */
-			else if (sublink->subLinkType == ANY_SUBLINK)
+			else if (sublink->subLinkType == ANY_SUBLINK || sublink->subLinkType == ALL_SUBLINK)
 			{
-				sublink->subLinkType = ALL_SUBLINK;
+				sublink->subLinkType = (sublink->subLinkType == ANY_SUBLINK) ? ALL_SUBLINK : ANY_SUBLINK;
 				sublink->testexpr = (Node *) canonicalize_qual(make_notclause((Expr *) sublink->testexpr));
+				return pull_up_sublinks_qual_recurse(root, (Node *) sublink, available_rels, jtlink);
 			}
-			else if (sublink->subLinkType == ALL_SUBLINK)
-			{
-				sublink->subLinkType = ANY_SUBLINK;
-				sublink->testexpr = (Node *) canonicalize_qual(make_notclause((Expr *) sublink->testexpr));
-			}
-			return pull_up_sublinks_qual_recurse(root, (Node *) sublink, available_rels, jtlink);
+
+			/*
+			 * Return the node unmodified for "NOT (subq)"
+			 * This subquery will get pulled up later during preprocess_qual_conditions()
+			 */
+			return node;
 		}

 		else if (not_clause(arg))

--- a/src/test/regress/expected/qp_correlated_query.out
+++ b/src/test/regress/expected/qp_correlated_query.out
@@ -3537,33 +3537,64 @@ select rnum, c1, c2 from qp_tjoin2 where 20 > all ( select c1 from qp_tjoin1) or
 (0 rows)

 -- start_ignore
-DROP TABLE IF EXISTS foo;
-NOTICE:  table "foo" does not exist, skipping
-DROP TABLE IF EXISTS bar;
-NOTICE:  table "bar" does not exist, skipping
-CREATE TABLE foo(a int, b int);
+DROP TABLE IF EXISTS qp_tab1;
+NOTICE:  table "qp_tab1" does not exist, skipping
+DROP TABLE IF EXISTS qp_tab2;
+NOTICE:  table "qp_tab2" does not exist, skipping
+DROP TABLE IF EXISTS qp_tab3;
+NOTICE:  table "qp_tab3" does not exist, skipping
+CREATE TABLE qp_tab1(a int, b int);
 NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table.
 HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
-CREATE TABLE bar(c int, d int);
+CREATE TABLE qp_tab2(c int, d int);
 NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c' as the Greenplum Database data distribution key for this table.
 HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+CREATE TABLE qp_tab3(e int, f int);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'e' as the Greenplum Database data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+INSERT INTO qp_tab1 VALUES (1,2);
+INSERT INTO qp_tab2 VALUES (3,4);
+INSERT INTO qp_tab3 VALUES (4,5);
 -- end_ignore
-EXPLAIN SELECT a FROM foo f1 LEFT JOIN bar on a=c WHERE NOT EXISTS(SELECT 1 FROM foo f2 WHERE f1.a = f2.a);
-                                       QUERY PLAN                                       
----------------------------------------------------------------------------------------
- Gather Motion 3:1  (slice1; segments: 3)  (cost=4074.50..1712216.48 rows=7414 width=4)
-   ->  Hash Anti Join  (cost=4074.50..1712216.48 rows=2472 width=4)
+EXPLAIN SELECT a FROM qp_tab1 f1 LEFT JOIN qp_tab2 on a=c WHERE NOT EXISTS(SELECT 1 FROM qp_tab1 f2 WHERE f1.a = f2.a);
+                                  QUERY PLAN                                   
+-------------------------------------------------------------------------------
+ Gather Motion 3:1  (slice1; segments: 3)  (cost=2.04..3.14 rows=4 width=4)
+   ->  Hash Anti Join  (cost=2.04..3.14 rows=2 width=4)
         Hash Cond: f1.a = f2.a
-         ->  Hash Left Join  (cost=2037.25..95878.62 rows=2471070 width=4)
-               Hash Cond: f1.a = bar.c
-               ->  Seq Scan on foo f1  (cost=0.00..961.00 rows=28700 width=4)
-               ->  Hash  (cost=961.00..961.00 rows=28700 width=4)
-                     ->  Seq Scan on bar  (cost=0.00..961.00 rows=28700 width=4)
-         ->  Hash  (cost=961.00..961.00 rows=28700 width=4)
-               ->  Seq Scan on foo f2  (cost=0.00..961.00 rows=28700 width=4)
- Settings:  optimizer=off
- Optimizer status: legacy query optimizer
-(12 rows)
+         ->  Hash Left Join  (cost=1.02..2.07 rows=2 width=4)
+               Hash Cond: f1.a = qp_tab2.c
+               ->  Seq Scan on qp_tab1 f1  (cost=0.00..1.01 rows=1 width=4)
+               ->  Hash  (cost=1.01..1.01 rows=1 width=4)
+                     ->  Seq Scan on qp_tab2  (cost=0.00..1.01 rows=1 width=4)
+         ->  Hash  (cost=1.01..1.01 rows=1 width=4)
+               ->  Seq Scan on qp_tab1 f2  (cost=0.00..1.01 rows=1 width=4)
+ Optimizer: legacy query optimizer
+(11 rows)
+
+EXPLAIN SELECT DISTINCT a FROM qp_tab1 WHERE NOT (SELECT TRUE FROM qp_tab2 WHERE EXISTS (SELECT * FROM qp_tab3 WHERE qp_tab2.c = qp_tab3.e));
+                                        QUERY PLAN                                        
+------------------------------------------------------------------------------------------
+ Gather Motion 3:1  (slice2; segments: 3)  (cost=3.08..3.09 rows=1 width=4)
+   ->  HashAggregate  (cost=3.08..3.09 rows=1 width=4)
+         Group Key: qp_tab1.a
+         InitPlan 1 (returns $0)  (slice3)
+           ->  Gather Motion 3:1  (slice1; segments: 3)  (cost=1.02..2.07 rows=4 width=0)
+                 ->  Hash Semi Join  (cost=1.02..2.07 rows=2 width=0)
+                       Hash Cond: qp_tab2.c = qp_tab3.e
+                       ->  Seq Scan on qp_tab2  (cost=0.00..1.01 rows=1 width=4)
+                       ->  Hash  (cost=1.01..1.01 rows=1 width=4)
+                             ->  Seq Scan on qp_tab3  (cost=0.00..1.01 rows=1 width=4)
+         ->  Result  (cost=0.00..1.01 rows=1 width=4)
+               One-Time Filter: NOT $0
+               ->  Seq Scan on qp_tab1  (cost=0.00..1.01 rows=1 width=4)
+ Optimizer: legacy query optimizer
+(14 rows)
+
+SELECT DISTINCT a FROM qp_tab1 WHERE NOT (SELECT TRUE FROM qp_tab2 WHERE EXISTS (SELECT * FROM qp_tab3 WHERE qp_tab2.c = qp_tab3.e));
+ a 
+---
+(0 rows)

 -- ----------------------------------------------------------------------
 -- Test: teardown.sql

--- a/src/test/regress/expected/qp_correlated_query_optimizer.out
+++ b/src/test/regress/expected/qp_correlated_query_optimizer.out
@@ -3694,34 +3694,75 @@ select rnum, c1, c2 from qp_tjoin2 where 20 > all ( select c1 from qp_tjoin1) or
 (0 rows)

 -- start_ignore
-DROP TABLE IF EXISTS foo;
-NOTICE:  table "foo" does not exist, skipping
-DROP TABLE IF EXISTS bar;
-NOTICE:  table "bar" does not exist, skipping
-CREATE TABLE foo(a int, b int);
+DROP TABLE IF EXISTS qp_tab1;
+NOTICE:  table "qp_tab1" does not exist, skipping
+DROP TABLE IF EXISTS qp_tab2;
+NOTICE:  table "qp_tab2" does not exist, skipping
+DROP TABLE IF EXISTS qp_tab3;
+NOTICE:  table "qp_tab3" does not exist, skipping
+CREATE TABLE qp_tab1(a int, b int);
 NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table.
 HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
-CREATE TABLE bar(c int, d int);
+CREATE TABLE qp_tab2(c int, d int);
 NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c' as the Greenplum Database data distribution key for this table.
 HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+CREATE TABLE qp_tab3(e int, f int);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'e' as the Greenplum Database data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+INSERT INTO qp_tab1 VALUES (1,2);
+INSERT INTO qp_tab2 VALUES (3,4);
+INSERT INTO qp_tab3 VALUES (4,5);
 -- end_ignore
-EXPLAIN SELECT a FROM foo f1 LEFT JOIN bar on a=c WHERE NOT EXISTS(SELECT 1 FROM foo f2 WHERE f1.a = f2.a);
-                                     QUERY PLAN                                      
-------------------------------------------------------------------------------------
+EXPLAIN SELECT a FROM qp_tab1 f1 LEFT JOIN qp_tab2 on a=c WHERE NOT EXISTS(SELECT 1 FROM qp_tab1 f2 WHERE f1.a = f2.a);
+                                       QUERY PLAN                                        
+-----------------------------------------------------------------------------------------
 Gather Motion 3:1  (slice1; segments: 3)  (cost=0.00..1293.00 rows=2 width=4)
   ->  Hash Left Join  (cost=0.00..1293.00 rows=1 width=4)
-         Hash Cond: qp_correlated_query.foo.a = bar.c
+         Hash Cond: qp_correlated_query.qp_tab1.a = qp_tab2.c
         ->  Hash Anti Join  (cost=0.00..862.00 rows=1 width=4)
-               Hash Cond: qp_correlated_query.foo.a = qp_correlated_query.foo.a
-               ->  Table Scan on foo  (cost=0.00..431.00 rows=1 width=4)
+               Hash Cond: qp_correlated_query.qp_tab1.a = qp_correlated_query.qp_tab1.a
+               ->  Table Scan on qp_tab1  (cost=0.00..431.00 rows=1 width=4)
               ->  Hash  (cost=431.00..431.00 rows=1 width=4)
                     ->  Result  (cost=0.00..431.00 rows=1 width=4)
-                           ->  Table Scan on foo  (cost=0.00..431.00 rows=1 width=4)
+                           ->  Table Scan on qp_tab1  (cost=0.00..431.00 rows=1 width=4)
         ->  Hash  (cost=431.00..431.00 rows=1 width=4)
-               ->  Table Scan on bar  (cost=0.00..431.00 rows=1 width=4)
- Settings:  optimizer=on
- Optimizer status: PQO version 2.46.2
-(13 rows)
+               ->  Table Scan on qp_tab2  (cost=0.00..431.00 rows=1 width=4)
+ Optimizer: PQO version 2.55.12
+(12 rows)
+
+EXPLAIN SELECT DISTINCT a FROM qp_tab1 WHERE NOT (SELECT TRUE FROM qp_tab2 WHERE EXISTS (SELECT * FROM qp_tab3 WHERE qp_tab2.c = qp_tab3.e));
+                                                                  QUERY PLAN                                                                   
+-----------------------------------------------------------------------------------------------------------------------------------------------
+ Gather Motion 3:1  (slice3; segments: 3)  (cost=0.00..1765377.02 rows=1 width=4)
+   ->  GroupAggregate  (cost=0.00..1765377.02 rows=1 width=4)
+         Group Key: qp_tab1.a
+         ->  Sort  (cost=0.00..1765377.02 rows=1 width=4)
+               Sort Key: qp_tab1.a
+               ->  Result  (cost=0.00..1765377.02 rows=1 width=4)
+                     Filter: NOT "outer".bool
+                     ->  Nested Loop Left Join  (cost=0.00..1765377.02 rows=1 width=5)
+                           Join Filter: true
+                           ->  Table Scan on qp_tab1  (cost=0.00..431.00 rows=1 width=4)
+                           ->  Assert  (cost=0.00..862.00 rows=1 width=1)
+                                 Assert Cond: (row_number() OVER (?)) = 1
+                                 ->  Materialize  (cost=0.00..862.00 rows=1 width=9)
+                                       ->  Broadcast Motion 1:3  (slice2)  (cost=0.00..862.00 rows=3 width=9)
+                                             ->  Result  (cost=0.00..862.00 rows=1 width=9)
+                                                   ->  WindowAgg  (cost=0.00..862.00 rows=1 width=9)
+                                                         ->  Gather Motion 3:1  (slice1; segments: 3)  (cost=0.00..862.00 rows=1 width=1)
+                                                               ->  Result  (cost=0.00..862.00 rows=1 width=1)
+                                                                     ->  Hash Semi Join  (cost=0.00..862.00 rows=1 width=1)
+                                                                           Hash Cond: qp_tab2.c = qp_tab3.e
+                                                                           ->  Table Scan on qp_tab2  (cost=0.00..431.00 rows=1 width=4)
+                                                                           ->  Hash  (cost=431.00..431.00 rows=1 width=4)
+                                                                                 ->  Table Scan on qp_tab3  (cost=0.00..431.00 rows=1 width=4)
+ Optimizer: PQO version 2.55.12
+(24 rows)
+
+SELECT DISTINCT a FROM qp_tab1 WHERE NOT (SELECT TRUE FROM qp_tab2 WHERE EXISTS (SELECT * FROM qp_tab3 WHERE qp_tab2.c = qp_tab3.e));
+ a 
+---
+(0 rows)

 -- ----------------------------------------------------------------------
 -- Test: teardown.sql

--- a/src/test/regress/expected/subselect_gp.out
+++ b/src/test/regress/expected/subselect_gp.out
@@ -1710,3 +1710,11 @@ SELECT EXISTS(SELECT * FROM tenk1 WHERE tenk1.unique1 = tenk2.unique1) FROM tenk
 t
 (1 row)

+--
+-- Ensure that NOT is not lost during subquery pull-up
+--
+SELECT 1 AS col1 WHERE NOT (SELECT 1 = 1);
+ col1 
+------
+(0 rows)
+
--- a/src/test/regress/expected/subselect_gp_optimizer.out
+++ b/src/test/regress/expected/subselect_gp_optimizer.out
@@ -1768,3 +1768,11 @@ SELECT EXISTS(SELECT * FROM tenk1 WHERE tenk1.unique1 = tenk2.unique1) FROM tenk
 t
 (1 row)

+--
+-- Ensure that NOT is not lost during subquery pull-up
+--
+SELECT 1 AS col1 WHERE NOT (SELECT 1 = 1);
+ col1 
+------
+(0 rows)
+
--- a/src/test/regress/sql/qp_correlated_query.sql
+++ b/src/test/regress/sql/qp_correlated_query.sql
@@ -834,14 +834,22 @@ select rnum, c1, c2 from qp_tjoin2 where 75 > all ( select c2 from qp_tjoin1) or
 select rnum, c1, c2 from qp_tjoin2 where 20 > all ( select c1 from qp_tjoin1) order by rnum;

 -- start_ignore
-DROP TABLE IF EXISTS foo;
-DROP TABLE IF EXISTS bar;
-
-CREATE TABLE foo(a int, b int);
-CREATE TABLE bar(c int, d int);
+DROP TABLE IF EXISTS qp_tab1;
+DROP TABLE IF EXISTS qp_tab2;
+DROP TABLE IF EXISTS qp_tab3;
+
+CREATE TABLE qp_tab1(a int, b int);
+CREATE TABLE qp_tab2(c int, d int);
+CREATE TABLE qp_tab3(e int, f int);
+INSERT INTO qp_tab1 VALUES (1,2);
+INSERT INTO qp_tab2 VALUES (3,4);
+INSERT INTO qp_tab3 VALUES (4,5);
 -- end_ignore

-EXPLAIN SELECT a FROM foo f1 LEFT JOIN bar on a=c WHERE NOT EXISTS(SELECT 1 FROM foo f2 WHERE f1.a = f2.a);
+EXPLAIN SELECT a FROM qp_tab1 f1 LEFT JOIN qp_tab2 on a=c WHERE NOT EXISTS(SELECT 1 FROM qp_tab1 f2 WHERE f1.a = f2.a);
+
+EXPLAIN SELECT DISTINCT a FROM qp_tab1 WHERE NOT (SELECT TRUE FROM qp_tab2 WHERE EXISTS (SELECT * FROM qp_tab3 WHERE qp_tab2.c = qp_tab3.e));
+SELECT DISTINCT a FROM qp_tab1 WHERE NOT (SELECT TRUE FROM qp_tab2 WHERE EXISTS (SELECT * FROM qp_tab3 WHERE qp_tab2.c = qp_tab3.e));

 -- ----------------------------------------------------------------------
 -- Test: teardown.sql

--- a/src/test/regress/sql/subselect_gp.sql
+++ b/src/test/regress/sql/subselect_gp.sql
@@ -727,3 +727,8 @@ EXPLAIN select count(distinct ss.ten) from
 EXPLAIN SELECT EXISTS(SELECT * FROM tenk1 WHERE tenk1.unique1 = tenk2.unique1) FROM tenk2 LIMIT 1;

 SELECT EXISTS(SELECT * FROM tenk1 WHERE tenk1.unique1 = tenk2.unique1) FROM tenk2 LIMIT 1;
+
+--
+-- Ensure that NOT is not lost during subquery pull-up
+--
+SELECT 1 AS col1 WHERE NOT (SELECT 1 = 1);