-- A classic motion hazard / deadlock is between the inner and outer motions,
-- but it is also possible to happen between the outer and joinqual / subplan
-- motions.  A sample plan is as below:
--
--  Gather Motion 3:1  (slice4; segments: 3)
--    ->  Hash Left Join
--          Hash Cond: (t_outer.c2 = t_inner.c2)
--          Join Filter: (NOT (SubPlan))
--          ->  Redistribute Motion 3:3  (slice1; segments: 3)
--                Hash Key: t_outer.c2
--                ->  Seq Scan on t_outer
--          ->  Hash
--                ->  Redistribute Motion 3:3  (slice2; segments: 3)
--                      Hash Key: t_inner.c2
--                      ->  Seq Scan on t_inner
--          SubPlan 1  (slice4; segments: 3)
--            ->  Result
--                  Filter: (t_subplan.c2 = t_outer.c1)
--                  ->  Materialize
--                        ->  Broadcast Motion 3:3  (slice3; segments: 3)
--                              ->  Seq Scan on t_subplan
-- Suppose :x0 is distributed on seg0, it does not matter if it is not.
-- This assumption is only to simplify the explanation.
\set x0 1
\set scale 10000
drop schema if exists deadlock2 cascade;
NOTICE:  schema "deadlock2" does not exist, skipping
create schema deadlock2;
set search_path = deadlock2;
create table t_inner (c1 int, c2 int);
NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Greenplum Database data distribution key for this table.
HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
create table t_outer (c1 int, c2 int);
NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Greenplum Database data distribution key for this table.
HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
create table t_subplan (c1 int, c2 int);
NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Greenplum Database data distribution key for this table.
HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
-- First load enough data on all the relations to generate redistribute-both
-- motion instead of broadcast-one motion.
insert into t_inner select i, i from generate_series(1,:scale) i;
insert into t_outer select i, i from generate_series(1,:scale) i;
insert into t_subplan select i, i from generate_series(1,:scale) i;
analyze t_inner;
analyze t_outer;
analyze t_subplan;
-- Then delete all of them and load the real data, do not use TRUNCATE as it
-- will clear the analyze information.
delete from t_inner;
delete from t_outer;
delete from t_subplan;
-- t_inner is the inner relation, it does not need much data as long as it is
-- not empty.
insert into t_inner values (:x0, :x0);
-- t_outer is the outer relation of the hash join, all its data are on seg0,
-- and redistributes to seg0, so outer@slice1@seg0 sends data to
-- hashjoin@slice4@seg0 and waits for ACK from it.  After all the data are sent
-- it will send EOS to all the segments of hashjoin@slice4.  So once
-- hashjoin@slice4@seg1 reads from outer@slice1 it has to wait for EOS.
insert into t_outer select :x0, :x0 from generate_series(1,:scale) i;
-- t_subplan is the subplan relation of the hash join, all its data are on
-- seg0, and broadcasts to seg0 and seg1.  When subplan@slice3@seg0 sends data
-- to hashjoin@slice4@seg1 it has to wait for ACK from it, this can happen
-- before hashjoin@slice4@seg1 reading from outer@slice1.
insert into t_subplan select :x0, :x0 from generate_series(1,:scale) i;
-- In the past hash join do the job like this:
--
-- 10. read all the inner tuples and build the hash table;
-- 20. read an outer tuple;
-- 30. pass the outer tuple to the join qual, reads through the motion in the
--     subplan;
-- 40. if there are more outer tuples goto 20.
--
-- So this makes it possible that hashjoin@slice4@seg0 is reading from
-- subplan@slice3@seg0 while itself is being waited by outer@slice1@seg0, then
-- it forms a deadlock:
--
--      outer@slice1@seg0   --[waits for ACK]->  hashjoin@slice4@seg0
--              ^                                         |
--              |                                         |
--    [waits for outer tuple]                 [waits for subplan tuple]
--              |                                         |
--              |                                         v
--     hashjoin@slice4@seg1  <-[wait for ACK]--  subplan@slice3@seg0
--
-- This deadlock is prevented by prefetching the subplan.
-- In theory this deadlock exists in all of hash join, merge join and nestloop
-- join, but so far we have only constructed a reproducer for hash join.
set enable_hashjoin to on;
set enable_mergejoin to off;
set enable_nestloop to off;
select count(*) from t_inner right join t_outer on t_inner.c2=t_outer.c2
   and not exists (select 0 from t_subplan where t_subplan.c2=t_outer.c1);
 count 
-------
 10000
(1 row)