join_gp_optimizer.out 12.6 KB
Newer Older
1 2 3 4 5 6 7 8
-- test numeric hash join
set enable_hashjoin to on;
set enable_mergejoin to off;
set enable_nestloop to off;
create table nhtest (i numeric(10, 2)) distributed by (i);
insert into nhtest values(100000.22);
insert into nhtest values(300000.19);
explain select * from nhtest a join nhtest b using (i);
9 10 11
                                       QUERY PLAN                                       
----------------------------------------------------------------------------------------
 Gather Motion 3:1  (slice1; segments: 3)  (cost=0.00..862.00 rows=1 width=11)
12 13 14 15 16
   ->  Hash Join  (cost=0.00..862.00 rows=1 width=11)
         Hash Cond: public.nhtest.i = public.nhtest.i
         ->  Table Scan on nhtest  (cost=0.00..431.00 rows=1 width=11)
         ->  Hash  (cost=431.00..431.00 rows=1 width=11)
               ->  Table Scan on nhtest  (cost=0.00..431.00 rows=1 width=11)
17 18 19
 Settings:  enable_hashjoin=on; enable_mergejoin=off; enable_nestloop=off; optimizer=on
 Optimizer status: PQO version 2.39.2
(8 rows)
20 21 22 23 24

select * from nhtest a join nhtest b using (i);
     i     
-----------
 100000.22
25
 300000.19
26 27 28
(2 rows)

create temp table l(a int);
29 30
NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table.
HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52
insert into l values (1), (1), (2);
select * from l l1 join l l2 on l1.a = l2.a left join l l3 on l1.a = l3.a and l1.a = 2 order by 1,2,3;
 a | a | a 
---+---+---
 1 | 1 |  
 1 | 1 |  
 1 | 1 |  
 1 | 1 |  
 2 | 2 | 2
(5 rows)

--
-- test hash join
--
create table hjtest (i int, j int) distributed by (i,j);
insert into hjtest values(3, 4);
select count(*) from hjtest a1, hjtest a2 where a2.i = least (a1.i,4) and a2.j = 4;
 count 
-------
     1
(1 row)

53 54 55 56 57 58
--
-- Test for correct behavior when there is a Merge Join on top of Materialize
-- on top of a Motion :
-- 1. Use FULL OUTER JOIN to induce a Merge Join
-- 2. Use a large tuple size to induce a Materialize
-- 3. Use gp_dist_random() to induce a Redistribute
59 60 61 62
--
set enable_hashjoin to off;
set enable_mergejoin to on;
set enable_nestloop to off;
63
DROP TABLE IF EXISTS alpha;
64
NOTICE:  table "alpha" does not exist, skipping
65
DROP TABLE IF EXISTS theta;
66 67 68
NOTICE:  table "theta" does not exist, skipping
CREATE TABLE alpha (i int, j int) distributed by (i);
CREATE TABLE theta (i int, j char(10000000)) distributed by (i);
69 70 71
INSERT INTO alpha values (1, 1), (2, 2);
INSERT INTO theta values (1, 'f'), (2, 'g');
SELECT *
72 73
FROM gp_dist_random('alpha') FULL OUTER JOIN gp_dist_random('theta')
  ON (alpha.i = theta.i)
74
WHERE (alpha.j IS NULL or theta.j IS NULL);
75 76
 i | j | i | j 
---+---+---+---
77 78 79 80 81
(0 rows)

reset enable_hashjoin;
reset enable_mergejoin;
reset enable_nestloop;
82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98
--
-- Predicate propagation over equality conditions
--
drop schema if exists pred;
NOTICE:  schema "pred" does not exist, skipping
create schema pred;
set search_path=pred;
create table t1 (x int, y int, z int) distributed by (y);
create table t2 (x int, y int, z int) distributed by (x);
insert into t1 select i, i, i from generate_series(1,100) i;
insert into t2 select * from t1;
analyze t1;
analyze t2;
--
-- infer over equalities
--
explain select count(*) from t1,t2 where t1.x = 100 and t1.x = t2.x;
99 100 101 102 103 104
                                                 QUERY PLAN                                                 
------------------------------------------------------------------------------------------------------------
 Aggregate  (cost=0.00..862.00 rows=1 width=8)
   ->  Gather Motion 3:1  (slice2; segments: 3)  (cost=0.00..862.00 rows=1 width=8)
         ->  Aggregate  (cost=0.00..862.00 rows=1 width=8)
               ->  Hash Join  (cost=0.00..862.00 rows=1 width=1)
105
                     Hash Cond: t1.x = t2.x
106
                     ->  Redistribute Motion 3:3  (slice1; segments: 3)  (cost=0.00..431.00 rows=1 width=4)
107 108 109 110 111 112
                           Hash Key: t1.x
                           ->  Table Scan on t1  (cost=0.00..431.00 rows=1 width=4)
                                 Filter: x = 100
                     ->  Hash  (cost=431.00..431.00 rows=1 width=4)
                           ->  Table Scan on t2  (cost=0.00..431.00 rows=1 width=4)
                                 Filter: x = 100
113 114 115
 Settings:  optimizer=on
 Optimizer status: PQO version 2.39.2
(14 rows)
116 117 118 119 120 121 122 123 124 125 126

select count(*) from t1,t2 where t1.x = 100 and t1.x = t2.x;
 count 
-------
     1
(1 row)

--
-- infer over >=
--
explain select * from t1,t2 where t1.x = 100 and t2.x >= t1.x;
127 128 129 130
                                          QUERY PLAN                                          
----------------------------------------------------------------------------------------------
 Gather Motion 3:1  (slice2; segments: 3)  (cost=0.00..1324039.56 rows=34 width=24)
   ->  Nested Loop  (cost=0.00..1324039.56 rows=12 width=24)
131
         Join Filter: t2.x >= t1.x
132
         ->  Broadcast Motion 3:3  (slice1; segments: 3)  (cost=0.00..431.00 rows=1 width=12)
133 134
               ->  Table Scan on t1  (cost=0.00..431.00 rows=1 width=12)
                     Filter: x = 100
135 136 137 138
         ->  Table Scan on t2  (cost=0.00..431.00 rows=34 width=12)
 Settings:  optimizer=on
 Optimizer status: PQO version 2.39.2
(9 rows)
139 140 141 142 143 144 145 146 147 148 149 150

select * from t1,t2 where t1.x = 100 and t2.x >= t1.x;
  x  |  y  |  z  |  x  |  y  |  z  
-----+-----+-----+-----+-----+-----
 100 | 100 | 100 | 100 | 100 | 100
(1 row)

--
-- multiple inferences
--
set optimizer_segments=2;
explain select * from t1,t2 where t1.x = 100 and t1.x = t2.y and t1.x <= t2.x;
151 152 153
                                              QUERY PLAN                                               
-------------------------------------------------------------------------------------------------------
 Gather Motion 3:1  (slice3; segments: 3)  (cost=0.00..862.01 rows=1 width=24)
154 155 156
   ->  Hash Join  (cost=0.00..862.01 rows=1 width=24)
         Hash Cond: t1.x = t2.y
         Join Filter: t1.x <= t2.x
157
         ->  Redistribute Motion 3:3  (slice1; segments: 3)  (cost=0.00..431.00 rows=1 width=12)
158 159 160 161
               Hash Key: t1.x
               ->  Table Scan on t1  (cost=0.00..431.00 rows=1 width=12)
                     Filter: x = 100
         ->  Hash  (cost=431.00..431.00 rows=1 width=12)
162
               ->  Redistribute Motion 3:3  (slice2; segments: 3)  (cost=0.00..431.00 rows=1 width=12)
163 164 165
                     Hash Key: t2.y
                     ->  Table Scan on t2  (cost=0.00..431.00 rows=1 width=12)
                           Filter: y = 100
166 167 168
 Settings:  optimizer=on; optimizer_segments=2
 Optimizer status: PQO version 2.39.2
(15 rows)
169 170 171 172 173 174 175 176 177 178 179 180 181

reset optimizer_segments;
select * from t1,t2 where t1.x = 100 and t1.x = t2.y and t1.x <= t2.x;
  x  |  y  |  z  |  x  |  y  |  z  
-----+-----+-----+-----+-----+-----
 100 | 100 | 100 | 100 | 100 | 100
(1 row)

--
-- MPP-18537: hash clause references a constant in outer child target list
--
create table hjn_test (i int, j int) distributed by (i,j);
insert into hjn_test values(3, 4);
182
create table int4_tbl (f1 int) distributed by (f1);
183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210
insert into int4_tbl values(123456), (-2147483647), (0), (-123456), (2147483647);
select count(*) from hjn_test, (select 3 as bar) foo where hjn_test.i = least (foo.bar,4) and hjn_test.j = 4;
 count 
-------
     1
(1 row)

select count(*) from hjn_test, (select 3 as bar) foo where hjn_test.i = least (foo.bar,(array[4])[1]) and hjn_test.j = (array[4])[1];
 count 
-------
     1
(1 row)

select count(*) from hjn_test, (select 3 as bar) foo where least (foo.bar,(array[4])[1]) = hjn_test.i and hjn_test.j = (array[4])[1];
 count 
-------
     1
(1 row)

select count(*) from hjn_test, (select 3 as bar) foo where hjn_test.i = least (foo.bar, least(4,10)) and hjn_test.j = least(4,10);
 count 
-------
     1
(1 row)

select * from int4_tbl a join int4_tbl b on (a.f1 = (select f1 from int4_tbl c where c.f1=b.f1));
     f1      |     f1      
-------------+-------------
211
  2147483647 |  2147483647
212 213 214 215 216 217
      123456 |      123456
 -2147483647 | -2147483647
           0 |           0
     -123456 |     -123456
(5 rows)

218 219 220 221 222 223 224 225 226 227 228 229 230 231 232
-- Same as the last query, but with a partitioned table (which requires a
-- Result node to do projection of the hash expression, as Append is not
-- projection-capable)
create table part4_tbl (f1 int4) partition by range (f1) (start(-1000000) end (1000000) every (1000000));
NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'f1' as the Greenplum Database data distribution key for this table.
HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
NOTICE:  CREATE TABLE will create partition "part4_tbl_1_prt_1" for table "part4_tbl"
NOTICE:  CREATE TABLE will create partition "part4_tbl_1_prt_2" for table "part4_tbl"
insert into part4_tbl values
       (-123457), (-123456), (-123455),
       (-1), (0), (1),
       (123455), (123456), (123457);
select * from part4_tbl a join part4_tbl b on (a.f1 = (select f1 from int4_tbl c where c.f1=b.f1));
   f1    |   f1    
---------+---------
233
       0 |       0
234 235 236 237
  123456 |  123456
 -123456 | -123456
(3 rows)

238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256
--
-- Test case where a Motion hash key is only needed for the redistribution,
-- and not returned in the final result set. There was a bug at one point where
-- tjoin.c1 was used as the hash key in a Motion node, but it was not added
-- to the sub-plans target list, causing a "variable not found in subplan
-- target list" error.
--
create table tjoin1(dk integer, id integer) distributed by (dk);
create table tjoin2(dk integer, id integer, t text) distributed by (dk);
create table tjoin3(dk integer, id integer, t text) distributed by (dk);
insert into tjoin1 values (1, 1), (1, 2), (1, 3), (2, 1), (2, 2), (2, 3);
insert into tjoin2 values (1, 1, '1-1'), (1, 2, '1-2'), (2, 1, '2-1'), (2, 2, '2-2');
insert into tjoin3 values (1, 1, '1-1'), (2, 1, '2-1');
select tjoin1.id, tjoin2.t, tjoin3.t
from tjoin1
left outer join (tjoin2 left outer join tjoin3 on tjoin2.id=tjoin3.id) on tjoin1.id=tjoin3.id;
 id |  t  |  t  
----+-----+-----
  3 |     | 
257
  3 |     | 
258
  1 | 2-1 | 1-1
259
  1 | 2-1 | 2-1
260
  1 | 1-1 | 1-1
261
  1 | 1-1 | 2-1
262
  2 |     | 
263 264 265 266
  1 | 2-1 | 1-1
  1 | 2-1 | 2-1
  1 | 1-1 | 1-1
  1 | 1-1 | 2-1
267 268 269
  2 |     | 
(12 rows)

270
set enable_hashjoin to off;
271
set optimizer_enable_hashjoin = off;
272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298
select count(*) from hjn_test, (select 3 as bar) foo where hjn_test.i = least (foo.bar,4) and hjn_test.j = 4;
 count 
-------
     1
(1 row)

select count(*) from hjn_test, (select 3 as bar) foo where hjn_test.i = least (foo.bar,(array[4])[1]) and hjn_test.j = (array[4])[1];
 count 
-------
     1
(1 row)

select count(*) from hjn_test, (select 3 as bar) foo where least (foo.bar,(array[4])[1]) = hjn_test.i and hjn_test.j = (array[4])[1];
 count 
-------
     1
(1 row)

select count(*) from hjn_test, (select 3 as bar) foo where hjn_test.i = least (foo.bar, least(4,10)) and hjn_test.j = least(4,10);
 count 
-------
     1
(1 row)

select * from int4_tbl a join int4_tbl b on (a.f1 = (select f1 from int4_tbl c where c.f1=b.f1));
     f1      |     f1      
-------------+-------------
299
  2147483647 |  2147483647
300 301 302 303 304 305 306
      123456 |      123456
 -2147483647 | -2147483647
           0 |           0
     -123456 |     -123456
(5 rows)

reset enable_hashjoin;
307
reset optimizer_enable_hashjoin;
308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324
-- In case of Left Anti Semi Join, if the left rel is empty a dummy join
-- should be created
drop table if exists foo;
NOTICE:  table "foo" does not exist, skipping
drop table if exists bar;
NOTICE:  table "bar" does not exist, skipping
create table foo (a int, b int) distributed randomly;
create table bar (c int, d int) distributed randomly;
insert into foo select generate_series(1,10);
insert into bar select generate_series(1,10);
explain select a from foo where a<1 and a>1 and not exists (select c from bar where c=a);
                   QUERY PLAN                   
------------------------------------------------
 Result  (cost=0.00..0.00 rows=0 width=4)
   ->  Result  (cost=0.00..0.00 rows=0 width=4)
         One-Time Filter: false
 Settings:  optimizer=on
325
 Optimizer status: PQO version 2.39.2
326 327 328 329 330 331 332
(5 rows)

select a from foo where a<1 and a>1 and not exists (select c from bar where c=a);
 a 
---
(0 rows)

333 334 335 336 337 338 339 340 341 342 343
-- The merge join executor code doesn't support LASJ_NOTIN joins. Make sure
-- the planner doesn't create an invalid plan with them.
create index index_foo on foo (a);
create index index_bar on bar (c);
set enable_nestloop to off;
set enable_hashjoin to off;
set enable_mergejoin to on;
select * from foo where a not in (select c from bar where c <= 5);
 a  | b 
----+---
  7 |  
344
  6 |  
345 346 347 348 349
  8 |  
  9 |  
 10 |  
(5 rows)

350
-- Cleanup
351
set client_min_messages='warning'; -- silence drop-cascade NOTICEs
352
drop schema pred cascade;