From 343f8826435e2d2b2f2cf8a5683e19d4d3ed6add Mon Sep 17 00:00:00 2001 From: Jinbao Chen Date: Wed, 21 Oct 2020 17:18:03 +0800 Subject: [PATCH] The inner relation of LASJ_NOTIN should not have a partitioned locus The result of NULL not in a non-empty set is false. The result of NULL not in an empty set is true. But if a non-empty set has a partitioned locus, the set is divided into several subsets, and some of those subsets may be empty. Because NULL not in an empty set evaluates to true, the result set may then contain tuples that should not exist. This patch disables the partitioned locus of the inner table by removing the join clause from the redistribution_clauses. This commit is cherry-picked from 6X_STABLE 8c93db54f3d93a890493f6a6d532f841779a9188 Co-authored-by: Hubert Zhang Co-authored-by: Richard Guo --- src/backend/cdb/cdbsubselect.c | 90 ++++++++++++- src/test/regress/expected/notin.out | 120 +++++++++++++++--- src/test/regress/expected/notin_optimizer.out | 81 ++++++++++++ src/test/regress/expected/qp_subquery.out | 22 ++-- src/test/regress/sql/notin.sql | 23 ++++ 5 files changed, 309 insertions(+), 27 deletions(-) diff --git a/src/backend/cdb/cdbsubselect.c b/src/backend/cdb/cdbsubselect.c index 4a925ff517..48b74e24a7 100644 --- a/src/backend/cdb/cdbsubselect.c +++ b/src/backend/cdb/cdbsubselect.c @@ -1694,6 +1694,89 @@ find_nonnullable_vars_walker(Node *node, NonNullableVarsContext *context) } +/* + * This function is used to determine whether the parameters of an expression in + * ALL Sublink can be NULL. 
+ */ +static bool +is_param_nullable(Node *node, Query *query, Value *oprname) +{ /* NOTE: oprname is not referenced anywhere in this body */ + bool result = false; /* becomes true once any operator argument may be NULL */ + NonNullableVarsContext context; + Expr *expr; + ListCell *lc; + Expr *arg; + + Assert(query); + context.query = query; + context.nonNullableVars = NIL; + + /* Walk the jointree with find_nonnullable_vars_walker; context.nonNullableVars is consulted below */ + expression_tree_walker((Node *) query->jointree, find_nonnullable_vars_walker, &context); + + /* + * For a null value, "not in / > all / < all" over a non-empty set is + * always false, but "not in / > all / < all" over an empty set is + * always true. So if the param is nullable, we should not make + * the locus "Partitioned". + * If the SQL is "... a not in (select ...)", the node should be a BoolExpr. + * If the SQL is "... a < all (select ...)", the node should be an OpExpr. + */ + if (nodeTag(node) == T_BoolExpr) + { + if(((BoolExpr *) node)->boolop != NOT_EXPR) + return false; /* a BoolExpr other than NOT is reported as not nullable */ + expr = lfirst(list_head(((BoolExpr*) node)->args)); /* the operand of NOT */ + } + else if (nodeTag(node) == T_OpExpr) + { + expr = (Expr *) node; + } + else + return true; /* any other node type is reported as nullable */ + + if (nodeTag(expr) != T_OpExpr) + return true; /* only an OpExpr is inspected further */ + + foreach(lc, ((OpExpr*)expr)->args) + { + arg = lfirst(lc); + + if (nodeTag(arg) == T_RelabelType) /* look through a RelabelType wrapper */ + arg = ((RelabelType*)arg)->arg; + + if (nodeTag(arg) == T_Param) + continue; /* a Param never changes the result */ + else if (nodeTag(arg) == T_Const) + { + /* + * Is this constant operator argument NULL? + */ + Const *constant = (Const *) arg; + + /* + * Note: the 'dummy' column is not NULL, so we don't need any special handling for it + */ + if (constant->constisnull == true) + result = true; + } + else if (nodeTag(arg) == T_Var) + { + Var *var = (Var *) arg; + + /* Was this var determined to be non-nullable? */ + if (!list_member(context.nonNullableVars, var)) + { + result = true; + } + } + else + result = true; /* any other argument node type counts as nullable */ + } + + return result; +} + /** * This method determines if the targetlist of a query is nullable. 
* Consider a query of the form: select t1.x, t2.y from t1, t2 where t1.x > 5 @@ -1818,9 +1901,14 @@ convert_IN_to_antijoin(PlannerInfo *root, List **rtrlist_inout __attribute__((un bool inner_nullable = is_targetlist_nullable(subselect); JoinExpr *join_expr = make_join_expr(larg, subq_indx, JOIN_LASJ_NOTIN); + ListCell *lc = list_head(sublink->operName); + bool outer_nullable = is_param_nullable(sublink->testexpr, + root->parse, + lc? list_head(sublink->operName)->data.ptr_value : NULL); + join_expr->quals = make_lasj_quals(root, sublink, subq_indx); - if (inner_nullable) + if (inner_nullable || outer_nullable) { join_expr->quals = add_null_match_clause(join_expr->quals); } diff --git a/src/test/regress/expected/notin.out b/src/test/regress/expected/notin.out index c6d90b7bde..d9e75346ed 100644 --- a/src/test/regress/expected/notin.out +++ b/src/test/regress/expected/notin.out @@ -107,25 +107,26 @@ select c1 from t1 where c1 not in explain select c1 from t1 where c1 not in (select c2 from t2 where c2 > 2 and c2 not in (select c3 from t3)); - QUERY PLAN -------------------------------------------------------------------------------------------------------------------- - Gather Motion 3:1 (slice2; segments: 3) (cost=4.49..7.66 rows=4 width=4) - -> Hash Left Anti Semi Join (Not-In) (cost=4.49..7.66 rows=2 width=4) + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------- + Gather Motion 3:1 (slice3; segments: 3) (cost=4.71..7.89 rows=4 width=4) + -> Hash Left Anti Semi Join (Not-In) (cost=4.71..7.89 rows=2 width=4) Hash Cond: t1.c1 = "NotIn_SUBQUERY".c2 -> Seq Scan on t1 (cost=0.00..3.10 rows=4 width=4) - -> Hash (cost=4.45..4.45 rows=2 width=4) - -> Subquery Scan "NotIn_SUBQUERY" (cost=2.29..4.45 rows=2 width=4) - -> Hash Left Anti Semi Join (Not-In) (cost=2.29..4.41 rows=2 width=4) - Hash Cond: t2.c2 = "NotIn_SUBQUERY".c3 - -> Seq Scan on t2 (cost=0.00..2.06 rows=2 width=4) - Filter: c2 > 
2 - -> Hash (cost=2.18..2.18 rows=3 width=4) - -> Broadcast Motion 3:3 (slice1; segments: 3) (cost=0.00..2.18 rows=3 width=4) - -> Subquery Scan "NotIn_SUBQUERY" (cost=0.00..2.06 rows=1 width=4) - -> Seq Scan on t3 (cost=0.00..2.03 rows=1 width=4) + -> Hash (cost=4.58..4.58 rows=4 width=4) + -> Broadcast Motion 3:3 (slice2; segments: 3) (cost=2.29..4.58 rows=4 width=4) + -> Subquery Scan "NotIn_SUBQUERY" (cost=2.29..4.45 rows=2 width=4) + -> Hash Left Anti Semi Join (Not-In) (cost=2.29..4.41 rows=2 width=4) + Hash Cond: t2.c2 = "NotIn_SUBQUERY".c3 + -> Seq Scan on t2 (cost=0.00..2.06 rows=2 width=4) + Filter: c2 > 2 + -> Hash (cost=2.18..2.18 rows=3 width=4) + -> Broadcast Motion 3:3 (slice1; segments: 3) (cost=0.00..2.18 rows=3 width=4) + -> Subquery Scan "NotIn_SUBQUERY" (cost=0.00..2.06 rows=1 width=4) + -> Seq Scan on t3 (cost=0.00..2.03 rows=1 width=4) Settings: optimizer=off Optimizer status: legacy query optimizer -(16 rows) +(17 rows) select c1 from t1 where c1 not in (select c2 from t2 where c2 > 2 and c2 not in @@ -1148,7 +1149,7 @@ select c1 from t1 where not not not c1 in (select c2 from t2); --q43 -- explain select c1 from t1 where c1 not in (select c2 from t2 where c2 > 4) and c1 is not null; - QUERY PLAN + QUERY PLAN ------------------------------------------------------------------------------------ Gather Motion 3:1 (slice1; segments: 3) (cost=2.09..5.25 rows=4 width=4) -> Hash Left Anti Semi Join (Not-In) (cost=2.09..5.25 rows=2 width=4) @@ -1192,8 +1193,95 @@ select c1 from t1 where c1 not in (select c2 from t2 where c2 > 4) and c1 > 2; 9 (7 rows) +-- Test the null not in an empty set +-- null not in an unempty set, always returns false +-- null not in an empty set, always returns true +-- +-- q46 +-- +create table table_source (c1 varchar(100),c2 varchar(100),c3 varchar(100),c4 varchar(100)); +insert into table_source (c1 ,c2 ,c3 ,c4 ) values ('000181202006010000003158',null,'INC','0000000001') ; +create table table_source2 as select * from 
table_source distributed by (c2); +create table table_source4 (c1 varchar(100),c2 varchar(100) not null,c3 varchar(100),c4 varchar(100)); +insert into table_source4 (c1 ,c2 ,c3 ,c4 ) values ('000181202006010000003158','a','INC','0000000001') ; +create table table_config (c1 varchar(10) ,c2 varchar(10) ,PRIMARY KEY (c1)); +NOTICE: CREATE TABLE / PRIMARY KEY will create implicit index "table_config_pkey" for table "table_config" +insert into table_config select i, 'test' from generate_series(1, 1000)i; +delete from table_config where gp_segment_id = 0; +explain select * from table_source where c3 = 'INC' and c4 = '0000000001' and c2 not in (SELECT c1 from table_config where c2='test'); + QUERY PLAN +------------------------------------------------------------------------------------------------------ + Gather Motion 3:1 (slice2; segments: 3) (cost=103.00..104.12 rows=10 width=258) + -> Hash Left Anti Semi Join (Not-In) (cost=103.00..104.12 rows=4 width=258) + Hash Cond: table_source.c2::text = "NotIn_SUBQUERY".c1::text + -> Seq Scan on table_source (cost=0.00..1.01 rows=1 width=258) + Filter: c3::text = 'INC'::text AND c4::text = '0000000001'::text + -> Hash (cost=65.50..65.50 rows=1000 width=38) + -> Broadcast Motion 3:3 (slice1; segments: 3) (cost=0.00..65.50 rows=1000 width=38) + -> Subquery Scan "NotIn_SUBQUERY" (cost=0.00..25.50 rows=334 width=38) + -> Seq Scan on table_config (cost=0.00..15.50 rows=334 width=3) + Filter: c2::text = 'test'::text + Settings: optimizer=off + Optimizer status: legacy query optimizer +(12 rows) + +select * from table_source where c3 = 'INC' and c4 = '0000000001' and c2 not in (SELECT c1 from table_config where c2='test'); + c1 | c2 | c3 | c4 +----+----+----+---- +(0 rows) + +explain select * from table_source2 where c3 = 'INC' and c4 = '0000000001' and c2 not in (SELECT c1 from table_config where c2='test'); + QUERY PLAN +------------------------------------------------------------------------------------------------------ + Gather 
Motion 3:1 (slice2; segments: 3) (cost=103.00..104.12 rows=10 width=258) + -> Hash Left Anti Semi Join (Not-In) (cost=103.00..104.12 rows=4 width=258) + Hash Cond: table_source2.c2::text = "NotIn_SUBQUERY".c1::text + -> Seq Scan on table_source2 (cost=0.00..1.01 rows=1 width=258) + Filter: c3::text = 'INC'::text AND c4::text = '0000000001'::text + -> Hash (cost=65.50..65.50 rows=1000 width=38) + -> Broadcast Motion 3:3 (slice1; segments: 3) (cost=0.00..65.50 rows=1000 width=38) + -> Subquery Scan "NotIn_SUBQUERY" (cost=0.00..25.50 rows=334 width=38) + -> Seq Scan on table_config (cost=0.00..15.50 rows=334 width=3) + Filter: c2::text = 'test'::text + Settings: optimizer=off + Optimizer status: legacy query optimizer +(12 rows) + +select * from table_source2 where c3 = 'INC' and c4 = '0000000001' and c2 not in (SELECT c1 from table_config where c2='test'); + c1 | c2 | c3 | c4 +----+----+----+---- +(0 rows) + +explain select * from table_source4 where c3 = 'INC' and c4 = '0000000001' and c2 not in (SELECT c1 from table_config where c2='test'); + QUERY PLAN +----------------------------------------------------------------------------------------------- + Gather Motion 3:1 (slice2; segments: 3) (cost=38.00..39.14 rows=10 width=42) + -> Hash Left Anti Semi Join (Not-In) (cost=38.00..39.14 rows=4 width=42) + Hash Cond: table_source4.c2::text = "NotIn_SUBQUERY".c1::text + -> Redistribute Motion 3:3 (slice1; segments: 3) (cost=0.00..1.03 rows=1 width=42) + Hash Key: table_source4.c2 + -> Seq Scan on table_source4 (cost=0.00..1.01 rows=1 width=42) + Filter: c3::text = 'INC'::text AND c4::text = '0000000001'::text + -> Hash (cost=25.50..25.50 rows=334 width=38) + -> Subquery Scan "NotIn_SUBQUERY" (cost=0.00..25.50 rows=334 width=38) + -> Seq Scan on table_config (cost=0.00..15.50 rows=334 width=3) + Filter: c2::text = 'test'::text + Settings: optimizer=off + Optimizer status: legacy query optimizer +(13 rows) + +select * from table_source4 where c3 = 'INC' and c4 = 
'0000000001' and c2 not in (SELECT c1 from table_config where c2='test'); + c1 | c2 | c3 | c4 +--------------------------+----+-----+------------ + 000181202006010000003158 | a | INC | 0000000001 +(1 row) + reset search_path; drop schema notin cascade; +NOTICE: drop cascades to table notin.table_config +NOTICE: drop cascades to table notin.table_source4 +NOTICE: drop cascades to table notin.table_source2 +NOTICE: drop cascades to table notin.table_source NOTICE: drop cascades to table notin.l1 NOTICE: drop cascades to table notin.g1 NOTICE: drop cascades to table notin.t1n diff --git a/src/test/regress/expected/notin_optimizer.out b/src/test/regress/expected/notin_optimizer.out index daa932e16d..06ca493138 100644 --- a/src/test/regress/expected/notin_optimizer.out +++ b/src/test/regress/expected/notin_optimizer.out @@ -1215,8 +1215,89 @@ select c1 from t1 where c1 not in (select c2 from t2 where c2 > 4) and c1 > 2; 7 (7 rows) +-- Test the null not in an empty set +-- null not in an unempty set, always returns false +-- null not in an empty set, always returns true +-- +-- q46 +-- +create table table_source (c1 varchar(100),c2 varchar(100),c3 varchar(100),c4 varchar(100)); +insert into table_source (c1 ,c2 ,c3 ,c4 ) values ('000181202006010000003158',null,'INC','0000000001') ; +create table table_source2 as select * from table_source distributed by (c2); +create table table_source4 (c1 varchar(100),c2 varchar(100) not null,c3 varchar(100),c4 varchar(100)); +insert into table_source4 (c1 ,c2 ,c3 ,c4 ) values ('000181202006010000003158','a','INC','0000000001') ; +create table table_config (c1 varchar(10) ,c2 varchar(10) ,PRIMARY KEY (c1)); +NOTICE: CREATE TABLE / PRIMARY KEY will create implicit index "table_config_pkey" for table "table_config" +insert into table_config select i, 'test' from generate_series(1, 1000)i; +delete from table_config where gp_segment_id = 0; +explain select * from table_source where c3 = 'INC' and c4 = '0000000001' and c2 not in (SELECT c1 
from table_config where c2='test'); + QUERY PLAN +--------------------------------------------------------------------------------------------- + Hash Left Anti Semi Join (Not-In) (cost=0.00..862.20 rows=1 width=40) + Hash Cond: table_source.c2::text = table_config.c1::text + -> Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..431.00 rows=1 width=40) + -> Table Scan on table_source (cost=0.00..431.00 rows=1 width=40) + Filter: c3::text = 'INC'::text AND c4::text = '0000000001'::text + -> Hash (cost=431.03..431.03 rows=334 width=3) + -> Gather Motion 3:1 (slice2; segments: 3) (cost=0.00..431.03 rows=1000 width=3) + -> Table Scan on table_config (cost=0.00..431.02 rows=334 width=3) + Filter: c2::text = 'test'::text + Optimizer status: PQO version 3.112.0 +(10 rows) + +select * from table_source where c3 = 'INC' and c4 = '0000000001' and c2 not in (SELECT c1 from table_config where c2='test'); + c1 | c2 | c3 | c4 +----+----+----+---- +(0 rows) + +explain select * from table_source2 where c3 = 'INC' and c4 = '0000000001' and c2 not in (SELECT c1 from table_config where c2='test'); + QUERY PLAN +--------------------------------------------------------------------------------------------- + Hash Left Anti Semi Join (Not-In) (cost=0.00..862.20 rows=1 width=40) + Hash Cond: table_source2.c2::text = table_config.c1::text + -> Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..431.00 rows=1 width=40) + -> Table Scan on table_source2 (cost=0.00..431.00 rows=1 width=40) + Filter: c3::text = 'INC'::text AND c4::text = '0000000001'::text + -> Hash (cost=431.03..431.03 rows=334 width=3) + -> Gather Motion 3:1 (slice2; segments: 3) (cost=0.00..431.03 rows=1000 width=3) + -> Table Scan on table_config (cost=0.00..431.02 rows=334 width=3) + Filter: c2::text = 'test'::text + Optimizer status: PQO version 3.112.0 +(10 rows) + +select * from table_source2 where c3 = 'INC' and c4 = '0000000001' and c2 not in (SELECT c1 from table_config where c2='test'); + c1 | c2 | c3 | c4 
+----+----+----+---- +(0 rows) + +explain select * from table_source4 where c3 = 'INC' and c4 = '0000000001' and c2 not in (SELECT c1 from table_config where c2='test'); + QUERY PLAN +------------------------------------------------------------------------------------------------- + Gather Motion 3:1 (slice2; segments: 3) (cost=0.00..862.08 rows=1 width=42) + -> Hash Left Anti Semi Join (Not-In) (cost=0.00..862.08 rows=1 width=42) + Hash Cond: table_source4.c2::text = table_config.c1::text + -> Redistribute Motion 3:3 (slice1; segments: 3) (cost=0.00..431.00 rows=1 width=42) + Hash Key: table_source4.c2 + -> Table Scan on table_source4 (cost=0.00..431.00 rows=1 width=42) + Filter: c3::text = 'INC'::text AND c4::text = '0000000001'::text + -> Hash (cost=431.02..431.02 rows=334 width=3) + -> Table Scan on table_config (cost=0.00..431.02 rows=334 width=3) + Filter: c2::text = 'test'::text + Optimizer status: PQO version 3.112.0 +(11 rows) + +select * from table_source4 where c3 = 'INC' and c4 = '0000000001' and c2 not in (SELECT c1 from table_config where c2='test'); + c1 | c2 | c3 | c4 +--------------------------+----+-----+------------ + 000181202006010000003158 | a | INC | 0000000001 +(1 row) + reset search_path; drop schema notin cascade; +NOTICE: drop cascades to table notin.table_config +NOTICE: drop cascades to table notin.table_source4 +NOTICE: drop cascades to table notin.table_source2 +NOTICE: drop cascades to table notin.table_source NOTICE: drop cascades to table notin.l1 NOTICE: drop cascades to table notin.g1 NOTICE: drop cascades to table notin.t1n diff --git a/src/test/regress/expected/qp_subquery.out b/src/test/regress/expected/qp_subquery.out index ccd7234084..94e8f7fed6 100644 --- a/src/test/regress/expected/qp_subquery.out +++ b/src/test/regress/expected/qp_subquery.out @@ -1230,18 +1230,20 @@ explain delete from TabDel1 where TabDel1.a not in (select a from TabDel3); -- d (11 rows) explain delete from TabDel2 where TabDel2.a not in (select a from 
TabDel4); -- support this - QUERY PLAN ------------------------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------------------------------------- Delete (slice0; segments: 3) (rows=2 width=10) - -> Hash Left Anti Semi Join (Not-In) (cost=1.03..3.11 rows=2 width=10) - Hash Cond: tabdel2.a = "NotIn_SUBQUERY".a - -> Seq Scan on tabdel2 (cost=0.00..2.03 rows=1 width=14) - Filter: a IS NOT NULL - -> Hash (cost=1.02..1.02 rows=1 width=4) - -> Subquery Scan "NotIn_SUBQUERY" (cost=0.00..1.02 rows=1 width=4) - -> Seq Scan on tabdel4 (cost=0.00..1.01 rows=1 width=4) + -> Explicit Redistribute Motion 3:3 (slice2; segments: 3) (cost=1.10..3.18 rows=2 width=10) + -> Hash Left Anti Semi Join (Not-In) (cost=1.10..3.18 rows=2 width=10) + Hash Cond: tabdel2.a = "NotIn_SUBQUERY".a + -> Seq Scan on tabdel2 (cost=0.00..2.03 rows=1 width=14) + -> Hash (cost=1.06..1.06 rows=1 width=4) + -> Broadcast Motion 3:3 (slice1; segments: 3) (cost=0.00..1.06 rows=1 width=4) + -> Subquery Scan "NotIn_SUBQUERY" (cost=0.00..1.02 rows=1 width=4) + -> Seq Scan on tabdel4 (cost=0.00..1.01 rows=1 width=4) + Settings: optimizer=off Optimizer status: legacy query optimizer -(9 rows) +(11 rows) -- start_ignore delete from TabDel2 where TabDel2.a not in (select a from TabDel4); diff --git a/src/test/regress/sql/notin.sql b/src/test/regress/sql/notin.sql index 757c4d7f9b..5005dc12f2 100644 --- a/src/test/regress/sql/notin.sql +++ b/src/test/regress/sql/notin.sql @@ -375,5 +375,28 @@ select c1 from t1 where c1 not in (select c2 from t2 where c2 > 4) and c1 is not -- select c1 from t1 where c1 not in (select c2 from t2 where c2 > 4) and c1 > 2; +-- Test the null not in an empty set +-- null not in an unempty set, always returns false +-- null not in an empty set, always returns true +-- +-- q46 +-- +create table table_source (c1 varchar(100),c2 varchar(100),c3 varchar(100),c4 varchar(100)); +insert into 
table_source (c1 ,c2 ,c3 ,c4 ) values ('000181202006010000003158',null,'INC','0000000001') ; +create table table_source2 as select * from table_source distributed by (c2); +create table table_source4 (c1 varchar(100),c2 varchar(100) not null,c3 varchar(100),c4 varchar(100)); +insert into table_source4 (c1 ,c2 ,c3 ,c4 ) values ('000181202006010000003158','a','INC','0000000001') ; +create table table_config (c1 varchar(10) ,c2 varchar(10) ,PRIMARY KEY (c1)); +insert into table_config select i, 'test' from generate_series(1, 1000)i; +delete from table_config where gp_segment_id = 0; + +explain select * from table_source where c3 = 'INC' and c4 = '0000000001' and c2 not in (SELECT c1 from table_config where c2='test'); +select * from table_source where c3 = 'INC' and c4 = '0000000001' and c2 not in (SELECT c1 from table_config where c2='test'); +explain select * from table_source2 where c3 = 'INC' and c4 = '0000000001' and c2 not in (SELECT c1 from table_config where c2='test'); +select * from table_source2 where c3 = 'INC' and c4 = '0000000001' and c2 not in (SELECT c1 from table_config where c2='test'); +explain select * from table_source4 where c3 = 'INC' and c4 = '0000000001' and c2 not in (SELECT c1 from table_config where c2='test'); +select * from table_source4 where c3 = 'INC' and c4 = '0000000001' and c2 not in (SELECT c1 from table_config where c2='test'); + + reset search_path; drop schema notin cascade; -- GitLab