-- test numeric hash join set enable_hashjoin to on; set enable_mergejoin to off; set enable_nestloop to off; create table nhtest (i numeric(10, 2)) distributed by (i); insert into nhtest values(100000.22); insert into nhtest values(300000.19); explain select * from nhtest a join nhtest b using (i); QUERY PLAN ---------------------------------------------------------------------------------------- Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..862.00 rows=1 width=11) -> Hash Join (cost=0.00..862.00 rows=1 width=11) Hash Cond: public.nhtest.i = public.nhtest.i -> Table Scan on nhtest (cost=0.00..431.00 rows=1 width=11) -> Hash (cost=431.00..431.00 rows=1 width=11) -> Table Scan on nhtest (cost=0.00..431.00 rows=1 width=11) Settings: enable_hashjoin=on; enable_mergejoin=off; enable_nestloop=off; optimizer=on Optimizer status: PQO version 2.39.2 (8 rows) select * from nhtest a join nhtest b using (i); i ----------- 100000.22 300000.19 (2 rows) create temp table l(a int); NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table. HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. insert into l values (1), (1), (2); select * from l l1 join l l2 on l1.a = l2.a left join l l3 on l1.a = l3.a and l1.a = 2 order by 1,2,3; a | a | a ---+---+--- 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 2 | 2 | 2 (5 rows) -- -- test hash join -- create table hjtest (i int, j int) distributed by (i,j); insert into hjtest values(3, 4); select count(*) from hjtest a1, hjtest a2 where a2.i = least (a1.i,4) and a2.j = 4; count ------- 1 (1 row) -- -- Test for correct behavior when there is a Merge Join on top of Materialize -- on top of a Motion : -- 1. Use FULL OUTER JOIN to induce a Merge Join -- 2. Use a large tuple size to induce a Materialize -- 3. Use gp_dist_random() to induce a Redistribute -- set enable_hashjoin to off; set enable_mergejoin to on; set enable_nestloop to off; DROP TABLE IF EXISTS alpha; NOTICE: table "alpha" does not exist, skipping DROP TABLE IF EXISTS theta; NOTICE: table "theta" does not exist, skipping CREATE TABLE alpha (i int, j int) distributed by (i); CREATE TABLE theta (i int, j char(10000000)) distributed by (i); INSERT INTO alpha values (1, 1), (2, 2); INSERT INTO theta values (1, 'f'), (2, 'g'); SELECT * FROM gp_dist_random('alpha') FULL OUTER JOIN gp_dist_random('theta') ON (alpha.i = theta.i) WHERE (alpha.j IS NULL or theta.j IS NULL); i | j | i | j ---+---+---+--- (0 rows) reset enable_hashjoin; reset enable_mergejoin; reset enable_nestloop; -- -- Predicate propagation over equality conditions -- drop schema if exists pred; NOTICE: schema "pred" does not exist, skipping create schema pred; set search_path=pred; create table t1 (x int, y int, z int) distributed by (y); create table t2 (x int, y int, z int) distributed by (x); insert into t1 select i, i, i from generate_series(1,100) i; insert into t2 select * from t1; analyze t1; analyze t2; -- -- infer over equalities -- explain select count(*) from t1,t2 where t1.x = 100 and t1.x = t2.x; QUERY PLAN ------------------------------------------------------------------------------------------------------------ Aggregate (cost=0.00..862.00 rows=1 width=8) -> Gather Motion 3:1 (slice2; segments: 3) (cost=0.00..862.00 rows=1 width=8) -> Aggregate (cost=0.00..862.00 rows=1 width=8) -> Hash Join (cost=0.00..862.00 rows=1 width=1) Hash Cond: t1.x = t2.x -> Redistribute Motion 3:3 (slice1; segments: 3) (cost=0.00..431.00 rows=1 width=4) Hash Key: t1.x -> Table Scan on t1 (cost=0.00..431.00 rows=1 width=4) Filter: x = 100 -> Hash (cost=431.00..431.00 rows=1 width=4) -> Table Scan on t2 (cost=0.00..431.00 rows=1 width=4) Filter: x = 100 Settings: optimizer=on Optimizer status: PQO version 2.39.2 (14 rows) select count(*) from t1,t2 where t1.x = 100 and t1.x = t2.x; count ------- 1 (1 row) -- -- infer over >= -- explain select * from t1,t2 where t1.x = 100 and t2.x >= t1.x; QUERY PLAN ---------------------------------------------------------------------------------------------- Gather Motion 3:1 (slice2; segments: 3) (cost=0.00..1324039.56 rows=34 width=24) -> Nested Loop (cost=0.00..1324039.56 rows=12 width=24) Join Filter: t2.x >= t1.x -> Broadcast Motion 3:3 (slice1; segments: 3) (cost=0.00..431.00 rows=1 width=12) -> Table Scan on t1 (cost=0.00..431.00 rows=1 width=12) Filter: x = 100 -> Table Scan on t2 (cost=0.00..431.00 rows=34 width=12) Settings: optimizer=on Optimizer status: PQO version 2.39.2 (9 rows) select * from t1,t2 where t1.x = 100 and t2.x >= t1.x; x | y | z | x | y | z -----+-----+-----+-----+-----+----- 100 | 100 | 100 | 100 | 100 | 100 (1 row) -- -- multiple inferences -- set optimizer_segments=2; explain select * from t1,t2 where t1.x = 100 and t1.x = t2.y and t1.x <= t2.x; QUERY PLAN ------------------------------------------------------------------------------------------------------- Gather Motion 3:1 (slice3; segments: 3) (cost=0.00..862.01 rows=1 width=24) -> Hash Join (cost=0.00..862.01 rows=1 width=24) Hash Cond: t1.x = t2.y Join Filter: t1.x <= t2.x -> Redistribute Motion 3:3 (slice1; segments: 3) (cost=0.00..431.00 rows=1 width=12) Hash Key: t1.x -> Table Scan on t1 (cost=0.00..431.00 rows=1 width=12) Filter: x = 100 -> Hash (cost=431.00..431.00 rows=1 width=12) -> Redistribute Motion 3:3 (slice2; segments: 3) (cost=0.00..431.00 rows=1 width=12) Hash Key: t2.y -> Table Scan on t2 (cost=0.00..431.00 rows=1 width=12) Filter: y = 100 Settings: optimizer=on; optimizer_segments=2 Optimizer status: PQO version 2.39.2 (15 rows) reset optimizer_segments; select * from t1,t2 where t1.x = 100 and t1.x = t2.y and t1.x <= t2.x; x | y | z | x | y | z -----+-----+-----+-----+-----+----- 100 | 100 | 100 | 100 | 100 | 100 (1 row) -- -- MPP-18537: hash clause references a constant in outer child target list -- create table hjn_test (i int, j int) distributed by (i,j); insert into hjn_test values(3, 4); create table int4_tbl (f1 int) distributed by (f1); insert into int4_tbl values(123456), (-2147483647), (0), (-123456), (2147483647); select count(*) from hjn_test, (select 3 as bar) foo where hjn_test.i = least (foo.bar,4) and hjn_test.j = 4; count ------- 1 (1 row) select count(*) from hjn_test, (select 3 as bar) foo where hjn_test.i = least (foo.bar,(array[4])[1]) and hjn_test.j = (array[4])[1]; count ------- 1 (1 row) select count(*) from hjn_test, (select 3 as bar) foo where least (foo.bar,(array[4])[1]) = hjn_test.i and hjn_test.j = (array[4])[1]; count ------- 1 (1 row) select count(*) from hjn_test, (select 3 as bar) foo where hjn_test.i = least (foo.bar, least(4,10)) and hjn_test.j = least(4,10); count ------- 1 (1 row) select * from int4_tbl a join int4_tbl b on (a.f1 = (select f1 from int4_tbl c where c.f1=b.f1)); f1 | f1 -------------+------------- 2147483647 | 2147483647 123456 | 123456 -2147483647 | -2147483647 0 | 0 -123456 | -123456 (5 rows) -- Same as the last query, but with a partitioned table (which requires a -- Result node to do projection of the hash expression, as Append is not -- projection-capable) create table part4_tbl (f1 int4) partition by range (f1) (start(-1000000) end (1000000) every (1000000)); NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'f1' as the Greenplum Database data distribution key for this table. HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. NOTICE: CREATE TABLE will create partition "part4_tbl_1_prt_1" for table "part4_tbl" NOTICE: CREATE TABLE will create partition "part4_tbl_1_prt_2" for table "part4_tbl" insert into part4_tbl values (-123457), (-123456), (-123455), (-1), (0), (1), (123455), (123456), (123457); select * from part4_tbl a join part4_tbl b on (a.f1 = (select f1 from int4_tbl c where c.f1=b.f1)); f1 | f1 ---------+--------- 0 | 0 123456 | 123456 -123456 | -123456 (3 rows) -- -- Test case where a Motion hash key is only needed for the redistribution, -- and not returned in the final result set. There was a bug at one point where -- tjoin.c1 was used as the hash key in a Motion node, but it was not added -- to the sub-plans target list, causing a "variable not found in subplan -- target list" error. -- create table tjoin1(dk integer, id integer) distributed by (dk); create table tjoin2(dk integer, id integer, t text) distributed by (dk); create table tjoin3(dk integer, id integer, t text) distributed by (dk); insert into tjoin1 values (1, 1), (1, 2), (1, 3), (2, 1), (2, 2), (2, 3); insert into tjoin2 values (1, 1, '1-1'), (1, 2, '1-2'), (2, 1, '2-1'), (2, 2, '2-2'); insert into tjoin3 values (1, 1, '1-1'), (2, 1, '2-1'); select tjoin1.id, tjoin2.t, tjoin3.t from tjoin1 left outer join (tjoin2 left outer join tjoin3 on tjoin2.id=tjoin3.id) on tjoin1.id=tjoin3.id; id | t | t ----+-----+----- 3 | | 3 | | 1 | 2-1 | 1-1 1 | 2-1 | 2-1 1 | 1-1 | 1-1 1 | 1-1 | 2-1 2 | | 1 | 2-1 | 1-1 1 | 2-1 | 2-1 1 | 1-1 | 1-1 1 | 1-1 | 2-1 2 | | (12 rows) set enable_hashjoin to off; set optimizer_enable_hashjoin = off; select count(*) from hjn_test, (select 3 as bar) foo where hjn_test.i = least (foo.bar,4) and hjn_test.j = 4; count ------- 1 (1 row) select count(*) from hjn_test, (select 3 as bar) foo where hjn_test.i = least (foo.bar,(array[4])[1]) and hjn_test.j = (array[4])[1]; count ------- 1 (1 row) select count(*) from hjn_test, (select 3 as bar) foo where least (foo.bar,(array[4])[1]) = hjn_test.i and hjn_test.j = (array[4])[1]; count ------- 1 (1 row) select count(*) from hjn_test, (select 3 as bar) foo where hjn_test.i = least (foo.bar, least(4,10)) and hjn_test.j = least(4,10); count ------- 1 (1 row) select * from int4_tbl a join int4_tbl b on (a.f1 = (select f1 from int4_tbl c where c.f1=b.f1)); f1 | f1 -------------+------------- 2147483647 | 2147483647 123456 | 123456 -2147483647 | -2147483647 0 | 0 -123456 | -123456 (5 rows) reset enable_hashjoin; reset optimizer_enable_hashjoin; -- In case of Left Anti Semi Join, if the left rel is empty a dummy join -- should be created drop table if exists foo; NOTICE: table "foo" does not exist, skipping drop table if exists bar; NOTICE: table "bar" does not exist, skipping create table foo (a int, b int) distributed randomly; create table bar (c int, d int) distributed randomly; insert into foo select generate_series(1,10); insert into bar select generate_series(1,10); explain select a from foo where a<1 and a>1 and not exists (select c from bar where c=a); QUERY PLAN ------------------------------------------------ Result (cost=0.00..0.00 rows=0 width=4) -> Result (cost=0.00..0.00 rows=0 width=4) One-Time Filter: false Settings: optimizer=on Optimizer status: PQO version 2.39.2 (5 rows) select a from foo where a<1 and a>1 and not exists (select c from bar where c=a); a --- (0 rows) -- The merge join executor code doesn't support LASJ_NOTIN joins. Make sure -- the planner doesn't create an invalid plan with them. create index index_foo on foo (a); create index index_bar on bar (c); set enable_nestloop to off; set enable_hashjoin to off; set enable_mergejoin to on; select * from foo where a not in (select c from bar where c <= 5); a | b ----+--- 7 | 6 | 8 | 9 | 10 | (5 rows) set enable_nestloop to off; set enable_hashjoin to on; set enable_mergejoin to off; create table dept ( id int, pid int, name char(40) ); NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'id' as the Greenplum Database data distribution key for this table. HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. insert into dept values(3, 0, 'root'); insert into dept values(4, 3, '2<-1'); insert into dept values(5, 4, '3<-2<-1'); insert into dept values(6, 4, '4<-2<-1'); insert into dept values(7, 3, '5<-1'); insert into dept values(8, 7, '5<-1'); insert into dept select i, i % 6 + 3 from generate_series(9,50) as i; insert into dept select i, 99 from generate_series(100,15000) as i; ANALYZE dept; -- Test rescannable hashjoin with spilling hashtable for buffile set statement_mem='1000kB'; set gp_workfile_type_hashjoin=buffile; WITH RECURSIVE subdept(id, parent_department, name) AS ( -- non recursive term SELECT * FROM dept WHERE name = 'root' UNION ALL -- recursive term SELECT d.* FROM dept AS d, subdept AS sd WHERE d.pid = sd.id ) SELECT count(*) FROM subdept; count ------- 48 (1 row) -- Test rescannable hashjoin with spilling hashtable for bfz set gp_workfile_type_hashjoin=bfz; WITH RECURSIVE subdept(id, parent_department, name) AS ( -- non recursive term SELECT * FROM dept WHERE name = 'root' UNION ALL -- recursive term SELECT d.* FROM dept AS d, subdept AS sd WHERE d.pid = sd.id ) SELECT count(*) FROM subdept; count ------- 48 (1 row) -- Test rescannable hashjoin with in-memory hashtable reset statement_mem; WITH RECURSIVE subdept(id, parent_department, name) AS ( -- non recursive term SELECT * FROM dept WHERE name = 'root' UNION ALL -- recursive term SELECT d.* FROM dept AS d, subdept AS sd WHERE d.pid = sd.id ) SELECT count(*) FROM subdept; count ------- 48 (1 row) -- MPP-29458 -- When we join on a clause with two different types. If one table distribute by one type, the query plan -- will redistribute data on another type. But the has values of two types would not be equal. The data will -- redistribute to wrong segments. create table test_timestamp_t1 (id numeric(10,0) ,field_dt date) distributed by (id); create table test_timestamp_t2 (id numeric(10,0),field_tms timestamp without time zone) distributed by (id,field_tms); insert into test_timestamp_t1 values(10 ,'2018-1-10'); insert into test_timestamp_t1 values(11 ,'2018-1-11'); insert into test_timestamp_t2 values(10 ,'2018-1-10'::timestamp); insert into test_timestamp_t2 values(11 ,'2018-1-11'::timestamp); -- Test nest loop redistribute keys set enable_nestloop to on; set enable_hashjoin to on; set enable_mergejoin to on; select count(*) from test_timestamp_t1 t1 ,test_timestamp_t2 t2 where T1.id = T2.id and T1.field_dt = t2.field_tms; count ------- 2 (1 row) -- Test hash join redistribute keys set enable_nestloop to off; set enable_hashjoin to on; set enable_mergejoin to on; select count(*) from test_timestamp_t1 t1 ,test_timestamp_t2 t2 where T1.id = T2.id and T1.field_dt = t2.field_tms; count ------- 2 (1 row) drop table test_timestamp_t1; drop table test_timestamp_t2; -- Test merge join redistribute keys create table test_timestamp_t1 (id numeric(10,0) ,field_dt date) distributed randomly; create table test_timestamp_t2 (id numeric(10,0),field_tms timestamp without time zone) distributed by (field_tms); insert into test_timestamp_t1 values(10 ,'2018-1-10'); insert into test_timestamp_t1 values(11 ,'2018-1-11'); insert into test_timestamp_t2 values(10 ,'2018-1-10'::timestamp); insert into test_timestamp_t2 values(11 ,'2018-1-11'::timestamp); select * from test_timestamp_t1 t1 full outer join test_timestamp_t2 t2 on T1.id = T2.id and T1.field_dt = t2.field_tms; id | field_dt | id | field_tms ----+------------+----+-------------------------- 10 | 01-10-2018 | 10 | Wed Jan 10 00:00:00 2018 11 | 01-11-2018 | 11 | Thu Jan 11 00:00:00 2018 (2 rows) -- test float type set enable_nestloop to off; set enable_hashjoin to on; set enable_mergejoin to on; create table test_float1(id int, data float4) DISTRIBUTED BY (data); create table test_float2(id int, data float8) DISTRIBUTED BY (data); insert into test_float1 values(1, 10), (2, 20); insert into test_float2 values(3, 10), (4, 20); select t1.id, t1.data, t2.id, t2.data from test_float1 t1, test_float2 t2 where t1.data = t2.data; id | data | id | data ----+------+----+------ 2 | 20 | 4 | 20 1 | 10 | 3 | 10 (2 rows) -- test int type create table test_int1(id int, data int4) DISTRIBUTED BY (data); create table test_int2(id int, data int8) DISTRIBUTED BY (data); insert into test_int1 values(1, 10), (2, 20); insert into test_int2 values(3, 10), (4, 20); select t1.id, t1.data, t2.id, t2.data from test_int1 t1, test_int2 t2 where t1.data = t2.data; id | data | id | data ----+------+----+------ 1 | 10 | 3 | 10 2 | 20 | 4 | 20 (2 rows) -- Cleanup set client_min_messages='warning'; -- silence drop-cascade NOTICEs drop schema pred cascade;