From a0a38df95cbd931b4370b227f8886fb9eb8c1398 Mon Sep 17 00:00:00 2001
From: Dhanashree Kashid <dkashid@pivotal.io>
Date: Thu, 15 Mar 2018 14:15:53 -0700
Subject: [PATCH] Fix wrong results for queries of type NOT (select <>)

For queries of the form "NOT (subselect)", planner lost the "NOT" operator
during the initial pull-up in pull_up_sublinks_qual_recurse() which resulted in
incorrect filter and hence wrong results. Inside
pull_up_sublinks_qual_recurse(), when a qual contains NOT, we check if sublink
type is any one of these: EXISTS, ANY or ALL, and invoke appropriate sublink
pull up routines. In case of qual of the form "NOT (SELECT <>)" the sublink
type is EXPR; hence we recurse into the argument of NOT; at which point we
lose the information about NOT operator.

This commit fixes the issue by returning the node unmodified when the argument
of NOT is an EXPR sublink. The EXPR sublink, later gets pulled up by
preprocess_qual_conditions() wherein, pull_up_sublinks() is invoked again to
handle sublinks in an expression.

Signed-off-by: Sambitesh Dash <sdash@pivotal.io>
---
 src/backend/optimizer/prep/prepjointree.c     | 21 +++---
 .../regress/expected/qp_correlated_query.out  | 73 ++++++++++++------
 .../qp_correlated_query_optimizer.out         | 75 ++++++++++++++-----
 src/test/regress/expected/subselect_gp.out    |  8 ++
 .../expected/subselect_gp_optimizer.out       |  8 ++
 src/test/regress/sql/qp_correlated_query.sql  | 20 +++--
 src/test/regress/sql/subselect_gp.sql         |  5 ++
 7 files changed, 156 insertions(+), 54 deletions(-)

diff --git a/src/backend/optimizer/prep/prepjointree.c b/src/backend/optimizer/prep/prepjointree.c
index 4d0fadf03f..84296b2593 100644
--- a/src/backend/optimizer/prep/prepjointree.c
+++ b/src/backend/optimizer/prep/prepjointree.c
@@ -396,13 +396,13 @@ pull_up_sublinks_qual_recurse(PlannerInfo *root, Node *node,
 	if (not_clause(node))
 	{
 		/* If the immediate argument of NOT is EXISTS, try to convert */
-		SubLink    *sublink = (SubLink *) get_notclausearg((Expr *) node);
 		Node	   *arg = (Node *) get_notclausearg((Expr *) node);
 		JoinExpr   *j;
 		Relids		child_rels;
 
-		if (sublink && IsA(sublink, SubLink))
+		if (arg && IsA(arg, SubLink))
 		{
+			SubLink    *sublink = (SubLink *) arg;
 			if (sublink->subLinkType == EXISTS_SUBLINK)
 			{
 				Node* subst;
@@ -440,17 +440,18 @@ pull_up_sublinks_qual_recurse(PlannerInfo *root, Node *node,
 			 *		 NOT val op ANY (subq)	 =>  val op' ALL (subq)
 			 *		 NOT val op ALL (subq)	 =>  val op' ANY (subq)
 			 */
-			else if (sublink->subLinkType == ANY_SUBLINK)
+			else if (sublink->subLinkType == ANY_SUBLINK || sublink->subLinkType == ALL_SUBLINK)
 			{
-				sublink->subLinkType = ALL_SUBLINK;
+				sublink->subLinkType = (sublink->subLinkType == ANY_SUBLINK) ? ALL_SUBLINK : ANY_SUBLINK;
 				sublink->testexpr = (Node *) canonicalize_qual(make_notclause((Expr *) sublink->testexpr));
+				return pull_up_sublinks_qual_recurse(root, (Node *) sublink, available_rels, jtlink);
 			}
-			else if (sublink->subLinkType == ALL_SUBLINK)
-			{
-				sublink->subLinkType = ANY_SUBLINK;
-				sublink->testexpr = (Node *) canonicalize_qual(make_notclause((Expr *) sublink->testexpr));
-			}
-			return pull_up_sublinks_qual_recurse(root, (Node *) sublink, available_rels, jtlink);
+
+			/*
+			 * Return the node unmodified for "NOT (subq)"
+			 * This subquery will get pulled up later during preprocess_qual_conditions()
+			 */
+			return node;
 		}
 
 		else if (not_clause(arg))
diff --git a/src/test/regress/expected/qp_correlated_query.out b/src/test/regress/expected/qp_correlated_query.out
index 65d0e2c902..80f670e076 100644
--- a/src/test/regress/expected/qp_correlated_query.out
+++ b/src/test/regress/expected/qp_correlated_query.out
@@ -3537,33 +3537,64 @@ select rnum, c1, c2 from qp_tjoin2 where 20 > all ( select c1 from qp_tjoin1) or
 (0 rows)
 
 -- start_ignore
-DROP TABLE IF EXISTS foo;
-NOTICE:  table "foo" does not exist, skipping
-DROP TABLE IF EXISTS bar;
-NOTICE:  table "bar" does not exist, skipping
-CREATE TABLE foo(a int, b int);
+DROP TABLE IF EXISTS qp_tab1;
+NOTICE:  table "qp_tab1" does not exist, skipping
+DROP TABLE IF EXISTS qp_tab2;
+NOTICE:  table "qp_tab2" does not exist, skipping
+DROP TABLE IF EXISTS qp_tab3;
+NOTICE:  table "qp_tab3" does not exist, skipping
+CREATE TABLE qp_tab1(a int, b int);
 NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table.
 HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
-CREATE TABLE bar(c int, d int);
+CREATE TABLE qp_tab2(c int, d int);
 NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c' as the Greenplum Database data distribution key for this table.
 HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+CREATE TABLE qp_tab3(e int, f int);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'e' as the Greenplum Database data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+INSERT INTO qp_tab1 VALUES (1,2);
+INSERT INTO qp_tab2 VALUES (3,4);
+INSERT INTO qp_tab3 VALUES (4,5);
 -- end_ignore
-EXPLAIN SELECT a FROM foo f1 LEFT JOIN bar on a=c WHERE NOT EXISTS(SELECT 1 FROM foo f2 WHERE f1.a = f2.a);
-                                       QUERY PLAN                                       
-----------------------------------------------------------------------------------------
- Gather Motion 3:1  (slice1; segments: 3)  (cost=4074.50..1712216.48 rows=7414 width=4)
-   ->  Hash Anti Join  (cost=4074.50..1712216.48 rows=2472 width=4)
+EXPLAIN SELECT a FROM qp_tab1 f1 LEFT JOIN qp_tab2 on a=c WHERE NOT EXISTS(SELECT 1 FROM qp_tab1 f2 WHERE f1.a = f2.a);
+                                  QUERY PLAN                                   
+-------------------------------------------------------------------------------
+ Gather Motion 3:1  (slice1; segments: 3)  (cost=2.04..3.14 rows=4 width=4)
+   ->  Hash Anti Join  (cost=2.04..3.14 rows=2 width=4)
          Hash Cond: f1.a = f2.a
-         ->  Hash Left Join  (cost=2037.25..95878.62 rows=2471070 width=4)
-               Hash Cond: f1.a = bar.c
-               ->  Seq Scan on foo f1  (cost=0.00..961.00 rows=28700 width=4)
-               ->  Hash  (cost=961.00..961.00 rows=28700 width=4)
-                     ->  Seq Scan on bar  (cost=0.00..961.00 rows=28700 width=4)
-         ->  Hash  (cost=961.00..961.00 rows=28700 width=4)
-               ->  Seq Scan on foo f2  (cost=0.00..961.00 rows=28700 width=4)
- Settings:  optimizer=off
- Optimizer status: legacy query optimizer
-(12 rows)
+         ->  Hash Left Join  (cost=1.02..2.07 rows=2 width=4)
+               Hash Cond: f1.a = qp_tab2.c
+               ->  Seq Scan on qp_tab1 f1  (cost=0.00..1.01 rows=1 width=4)
+               ->  Hash  (cost=1.01..1.01 rows=1 width=4)
+                     ->  Seq Scan on qp_tab2  (cost=0.00..1.01 rows=1 width=4)
+         ->  Hash  (cost=1.01..1.01 rows=1 width=4)
+               ->  Seq Scan on qp_tab1 f2  (cost=0.00..1.01 rows=1 width=4)
+ Optimizer: legacy query optimizer
+(11 rows)
+
+EXPLAIN SELECT DISTINCT a FROM qp_tab1 WHERE NOT (SELECT TRUE FROM qp_tab2 WHERE EXISTS (SELECT * FROM qp_tab3 WHERE qp_tab2.c = qp_tab3.e));
+                                        QUERY PLAN                                        
+------------------------------------------------------------------------------------------
+ Gather Motion 3:1  (slice2; segments: 3)  (cost=3.08..3.09 rows=1 width=4)
+   ->  HashAggregate  (cost=3.08..3.09 rows=1 width=4)
+         Group Key: qp_tab1.a
+         InitPlan 1 (returns $0)  (slice3)
+           ->  Gather Motion 3:1  (slice1; segments: 3)  (cost=1.02..2.07 rows=4 width=0)
+                 ->  Hash Semi Join  (cost=1.02..2.07 rows=2 width=0)
+                       Hash Cond: qp_tab2.c = qp_tab3.e
+                       ->  Seq Scan on qp_tab2  (cost=0.00..1.01 rows=1 width=4)
+                       ->  Hash  (cost=1.01..1.01 rows=1 width=4)
+                             ->  Seq Scan on qp_tab3  (cost=0.00..1.01 rows=1 width=4)
+         ->  Result  (cost=0.00..1.01 rows=1 width=4)
+               One-Time Filter: NOT $0
+               ->  Seq Scan on qp_tab1  (cost=0.00..1.01 rows=1 width=4)
+ Optimizer: legacy query optimizer
+(14 rows)
+
+SELECT DISTINCT a FROM qp_tab1 WHERE NOT (SELECT TRUE FROM qp_tab2 WHERE EXISTS (SELECT * FROM qp_tab3 WHERE qp_tab2.c = qp_tab3.e));
+ a 
+---
+(0 rows)
 
 -- ----------------------------------------------------------------------
 -- Test: teardown.sql
diff --git a/src/test/regress/expected/qp_correlated_query_optimizer.out b/src/test/regress/expected/qp_correlated_query_optimizer.out
index 6f67cee17d..c80a6c01b7 100644
--- a/src/test/regress/expected/qp_correlated_query_optimizer.out
+++ b/src/test/regress/expected/qp_correlated_query_optimizer.out
@@ -3694,34 +3694,75 @@ select rnum, c1, c2 from qp_tjoin2 where 20 > all ( select c1 from qp_tjoin1) or
 (0 rows)
 
 -- start_ignore
-DROP TABLE IF EXISTS foo;
-NOTICE:  table "foo" does not exist, skipping
-DROP TABLE IF EXISTS bar;
-NOTICE:  table "bar" does not exist, skipping
-CREATE TABLE foo(a int, b int);
+DROP TABLE IF EXISTS qp_tab1;
+NOTICE:  table "qp_tab1" does not exist, skipping
+DROP TABLE IF EXISTS qp_tab2;
+NOTICE:  table "qp_tab2" does not exist, skipping
+DROP TABLE IF EXISTS qp_tab3;
+NOTICE:  table "qp_tab3" does not exist, skipping
+CREATE TABLE qp_tab1(a int, b int);
 NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table.
 HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
-CREATE TABLE bar(c int, d int);
+CREATE TABLE qp_tab2(c int, d int);
 NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c' as the Greenplum Database data distribution key for this table.
 HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+CREATE TABLE qp_tab3(e int, f int);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'e' as the Greenplum Database data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+INSERT INTO qp_tab1 VALUES (1,2);
+INSERT INTO qp_tab2 VALUES (3,4);
+INSERT INTO qp_tab3 VALUES (4,5);
 -- end_ignore
-EXPLAIN SELECT a FROM foo f1 LEFT JOIN bar on a=c WHERE NOT EXISTS(SELECT 1 FROM foo f2 WHERE f1.a = f2.a);
-                                     QUERY PLAN                                      
--------------------------------------------------------------------------------------
+EXPLAIN SELECT a FROM qp_tab1 f1 LEFT JOIN qp_tab2 on a=c WHERE NOT EXISTS(SELECT 1 FROM qp_tab1 f2 WHERE f1.a = f2.a);
+                                       QUERY PLAN                                        
+-----------------------------------------------------------------------------------------
  Gather Motion 3:1  (slice1; segments: 3)  (cost=0.00..1293.00 rows=2 width=4)
    ->  Hash Left Join  (cost=0.00..1293.00 rows=1 width=4)
-         Hash Cond: qp_correlated_query.foo.a = bar.c
+         Hash Cond: qp_correlated_query.qp_tab1.a = qp_tab2.c
          ->  Hash Anti Join  (cost=0.00..862.00 rows=1 width=4)
-               Hash Cond: qp_correlated_query.foo.a = qp_correlated_query.foo.a
-               ->  Table Scan on foo  (cost=0.00..431.00 rows=1 width=4)
+               Hash Cond: qp_correlated_query.qp_tab1.a = qp_correlated_query.qp_tab1.a
+               ->  Table Scan on qp_tab1  (cost=0.00..431.00 rows=1 width=4)
                ->  Hash  (cost=431.00..431.00 rows=1 width=4)
                      ->  Result  (cost=0.00..431.00 rows=1 width=4)
-                           ->  Table Scan on foo  (cost=0.00..431.00 rows=1 width=4)
+                           ->  Table Scan on qp_tab1  (cost=0.00..431.00 rows=1 width=4)
          ->  Hash  (cost=431.00..431.00 rows=1 width=4)
-               ->  Table Scan on bar  (cost=0.00..431.00 rows=1 width=4)
- Settings:  optimizer=on
- Optimizer status: PQO version 2.46.2
-(13 rows)
+               ->  Table Scan on qp_tab2  (cost=0.00..431.00 rows=1 width=4)
+ Optimizer: PQO version 2.55.12
+(12 rows)
+
+EXPLAIN SELECT DISTINCT a FROM qp_tab1 WHERE NOT (SELECT TRUE FROM qp_tab2 WHERE EXISTS (SELECT * FROM qp_tab3 WHERE qp_tab2.c = qp_tab3.e));
+                                                                  QUERY PLAN                                                                   
+-----------------------------------------------------------------------------------------------------------------------------------------------
+ Gather Motion 3:1  (slice3; segments: 3)  (cost=0.00..1765377.02 rows=1 width=4)
+   ->  GroupAggregate  (cost=0.00..1765377.02 rows=1 width=4)
+         Group Key: qp_tab1.a
+         ->  Sort  (cost=0.00..1765377.02 rows=1 width=4)
+               Sort Key: qp_tab1.a
+               ->  Result  (cost=0.00..1765377.02 rows=1 width=4)
+                     Filter: NOT "outer".bool
+                     ->  Nested Loop Left Join  (cost=0.00..1765377.02 rows=1 width=5)
+                           Join Filter: true
+                           ->  Table Scan on qp_tab1  (cost=0.00..431.00 rows=1 width=4)
+                           ->  Assert  (cost=0.00..862.00 rows=1 width=1)
+                                 Assert Cond: (row_number() OVER (?)) = 1
+                                 ->  Materialize  (cost=0.00..862.00 rows=1 width=9)
+                                       ->  Broadcast Motion 1:3  (slice2)  (cost=0.00..862.00 rows=3 width=9)
+                                             ->  Result  (cost=0.00..862.00 rows=1 width=9)
+                                                   ->  WindowAgg  (cost=0.00..862.00 rows=1 width=9)
+                                                         ->  Gather Motion 3:1  (slice1; segments: 3)  (cost=0.00..862.00 rows=1 width=1)
+                                                               ->  Result  (cost=0.00..862.00 rows=1 width=1)
+                                                                     ->  Hash Semi Join  (cost=0.00..862.00 rows=1 width=1)
+                                                                           Hash Cond: qp_tab2.c = qp_tab3.e
+                                                                           ->  Table Scan on qp_tab2  (cost=0.00..431.00 rows=1 width=4)
+                                                                           ->  Hash  (cost=431.00..431.00 rows=1 width=4)
+                                                                                 ->  Table Scan on qp_tab3  (cost=0.00..431.00 rows=1 width=4)
+ Optimizer: PQO version 2.55.12
+(24 rows)
+
+SELECT DISTINCT a FROM qp_tab1 WHERE NOT (SELECT TRUE FROM qp_tab2 WHERE EXISTS (SELECT * FROM qp_tab3 WHERE qp_tab2.c = qp_tab3.e));
+ a 
+---
+(0 rows)
 
 -- ----------------------------------------------------------------------
 -- Test: teardown.sql
diff --git a/src/test/regress/expected/subselect_gp.out b/src/test/regress/expected/subselect_gp.out
index 32a31f63bb..e606ef1428 100644
--- a/src/test/regress/expected/subselect_gp.out
+++ b/src/test/regress/expected/subselect_gp.out
@@ -1710,3 +1710,11 @@ SELECT EXISTS(SELECT * FROM tenk1 WHERE tenk1.unique1 = tenk2.unique1) FROM tenk
  t
 (1 row)
 
+--
+-- Ensure that NOT is not lost during subquery pull-up
+--
+SELECT 1 AS col1 WHERE NOT (SELECT 1 = 1);
+ col1 
+------
+(0 rows)
+
diff --git a/src/test/regress/expected/subselect_gp_optimizer.out b/src/test/regress/expected/subselect_gp_optimizer.out
index 273e3d0674..3ebdf356d5 100644
--- a/src/test/regress/expected/subselect_gp_optimizer.out
+++ b/src/test/regress/expected/subselect_gp_optimizer.out
@@ -1768,3 +1768,11 @@ SELECT EXISTS(SELECT * FROM tenk1 WHERE tenk1.unique1 = tenk2.unique1) FROM tenk
  t
 (1 row)
 
+--
+-- Ensure that NOT is not lost during subquery pull-up
+--
+SELECT 1 AS col1 WHERE NOT (SELECT 1 = 1);
+ col1 
+------
+(0 rows)
+
diff --git a/src/test/regress/sql/qp_correlated_query.sql b/src/test/regress/sql/qp_correlated_query.sql
index 683c3eae34..72e22f0ee9 100644
--- a/src/test/regress/sql/qp_correlated_query.sql
+++ b/src/test/regress/sql/qp_correlated_query.sql
@@ -834,14 +834,22 @@ select rnum, c1, c2 from qp_tjoin2 where 75 > all ( select c2 from qp_tjoin1) or
 select rnum, c1, c2 from qp_tjoin2 where 20 > all ( select c1 from qp_tjoin1) order by rnum;
 
 -- start_ignore
-DROP TABLE IF EXISTS foo;
-DROP TABLE IF EXISTS bar;
-
-CREATE TABLE foo(a int, b int);
-CREATE TABLE bar(c int, d int);
+DROP TABLE IF EXISTS qp_tab1;
+DROP TABLE IF EXISTS qp_tab2;
+DROP TABLE IF EXISTS qp_tab3;
+
+CREATE TABLE qp_tab1(a int, b int);
+CREATE TABLE qp_tab2(c int, d int);
+CREATE TABLE qp_tab3(e int, f int);
+INSERT INTO qp_tab1 VALUES (1,2);
+INSERT INTO qp_tab2 VALUES (3,4);
+INSERT INTO qp_tab3 VALUES (4,5);
 -- end_ignore
 
-EXPLAIN SELECT a FROM foo f1 LEFT JOIN bar on a=c WHERE NOT EXISTS(SELECT 1 FROM foo f2 WHERE f1.a = f2.a);
+EXPLAIN SELECT a FROM qp_tab1 f1 LEFT JOIN qp_tab2 on a=c WHERE NOT EXISTS(SELECT 1 FROM qp_tab1 f2 WHERE f1.a = f2.a);
+
+EXPLAIN SELECT DISTINCT a FROM qp_tab1 WHERE NOT (SELECT TRUE FROM qp_tab2 WHERE EXISTS (SELECT * FROM qp_tab3 WHERE qp_tab2.c = qp_tab3.e));
+SELECT DISTINCT a FROM qp_tab1 WHERE NOT (SELECT TRUE FROM qp_tab2 WHERE EXISTS (SELECT * FROM qp_tab3 WHERE qp_tab2.c = qp_tab3.e));
 
 -- ----------------------------------------------------------------------
 -- Test: teardown.sql
diff --git a/src/test/regress/sql/subselect_gp.sql b/src/test/regress/sql/subselect_gp.sql
index 71ad718269..6b874181b0 100644
--- a/src/test/regress/sql/subselect_gp.sql
+++ b/src/test/regress/sql/subselect_gp.sql
@@ -727,3 +727,8 @@ EXPLAIN select count(distinct ss.ten) from
 EXPLAIN SELECT EXISTS(SELECT * FROM tenk1 WHERE tenk1.unique1 = tenk2.unique1) FROM tenk2 LIMIT 1;
 
 SELECT EXISTS(SELECT * FROM tenk1 WHERE tenk1.unique1 = tenk2.unique1) FROM tenk2 LIMIT 1;
+
+--
+-- Ensure that NOT is not lost during subquery pull-up
+--
+SELECT 1 AS col1 WHERE NOT (SELECT 1 = 1);
-- 
GitLab