From c7fce8448eb0e4b0fee78d1185893954e99135a1 Mon Sep 17 00:00:00 2001 From: Alexandra Wang Date: Fri, 22 Mar 2019 11:48:33 -0700 Subject: [PATCH] Lock child partition tables at INSERT InitPlan() on QD INSERT needs to Lock the child table on QD, otherwise the following dead lock scenario may happen if INSERT runs concurrently with VACUUM drop phase on an AppendOnly partitioned table. 1. VACUUM on QD: acquired AccessExclusiveLock on the child table 2. INSERT on QE: acquired RowExclusiveLock on the child table 3. VACUUM on QE: waiting for AccessExclusiveLock on the child table 4. INSERT on QD: waiting for AccessShareLock at ExecutorEnd() on the child table, this is after QE sends back which child it inserted Note that in step 2, INSERT only locks the child table on QE, it does not lock the child table on QD in the previous code. This patch adds the lock on QD as well to prevent the above dead lock. Added `insert_while_vacuum_drop` and updated `partition_locking` to reflect the changes --- src/backend/commands/vacuum.c | 2 +- src/backend/executor/execMain.c | 30 ++++++++- .../input/uao/insert_while_vacuum_drop.source | 55 +++++++++++++++++ .../uao/insert_while_vacuum_drop.source | 61 +++++++++++++++++++ .../regress/expected/partition_locking.out | 22 +++++-- .../expected/partition_locking_optimizer.out | 22 +++++-- 6 files changed, 180 insertions(+), 12 deletions(-) create mode 100644 src/test/isolation2/input/uao/insert_while_vacuum_drop.source create mode 100644 src/test/isolation2/output/uao/insert_while_vacuum_drop.source diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index 6bd6680df8..08bc07442e 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -675,7 +675,7 @@ vacuumStatement_Relation(VacuumStmt *vacstmt, Oid relid, { Assert(Gp_role == GP_ROLE_EXECUTE); lmode = AccessExclusiveLock; - + SIMPLE_FAULT_INJECTOR(VacuumRelationOpenRelationDuringDropPhase); } else if (!(vacstmt->options & VACOPT_VACUUM)) lmode = ShareUpdateExclusiveLock; diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index d3b9f1b090..dad72e528e 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -1759,7 +1759,35 @@ InitPlan(QueryDesc *queryDesc, int eflags) * get each element's relation oid. */ if (containRoot) - all_relids = find_all_inheritors(relid, NoLock, NULL); + { + /* + * For partition table, INSERT plan only contains the root table + * in the result relations, whereas DELETE and UPDATE contain + * both root table and the partition tables. + * + * Without locking the partition relations on QD when INSERT + * the following dead lock scenario may happen between INSERT and + * AppendOnly VACUUM drop phase on the partition table: + * + * 1. AO VACUUM drop on QD: acquired AccessExclusiveLock + * 2. INSERT on QE: acquired RowExclusiveLock + * 3. AO VACUUM drop on QE: waiting for AccessExclusiveLock + * 4. INSERT on QD: waiting for AccessShareLock at ExecutorEnd() + * + * 2 blocks 3, 1 blocks 4, 1 and 2 will not release their locks + * until 3 and 4 proceed. Hence INSERT needs to Lock the partition + * tables on QD here (before 2) to prevent this dead lock. + * + * FIXME: Ideally we may want to + * 1) Lock the partition table during the parse stage just as when + * we lock the root oid + * 2) Only lock the particular partition table that we are + * inserting into, right now we don't have that info here. + */ + lockmode = (operation == CMD_INSERT && rel_is_partitioned(relid)) ? + RowExclusiveLock : NoLock; + all_relids = find_all_inheritors(relid, lockmode, NULL); + } else { ListCell *lc; diff --git a/src/test/isolation2/input/uao/insert_while_vacuum_drop.source b/src/test/isolation2/input/uao/insert_while_vacuum_drop.source new file mode 100644 index 0000000000..88690ebc4b --- /dev/null +++ b/src/test/isolation2/input/uao/insert_while_vacuum_drop.source @@ -0,0 +1,55 @@ +-- @Description Ensures that an INSERT while VACUUM drop phase does not leave +-- the segfile state inconsistent on master and the primary. +-- +CREATE EXTENSION IF NOT EXISTS gp_inject_fault; +select gp_inject_fault('all', 'reset', 1); + +-- Helper function +CREATE or REPLACE FUNCTION wait_until_acquired_lock_on_rel (rel_name text, lmode text, segment_id integer) RETURNS /*in func*/ + bool AS $$ /*in func*/ +declare /*in func*/ + result bool; /*in func*/ +begin /*in func*/ +result := false; /*in func*/ +-- Wait until lock is acquired /*in func*/ +while result = false loop /*in func*/ +select l.granted INTO result /*in func*/ +from pg_locks l, pg_class c /*in func*/ +where l.relation = c.oid /*in func*/ + and c.relname=rel_name /*in func*/ + and l.mode=lmode /*in func*/ + and l.gp_segment_id=segment_id; /*in func*/ +if result = false then /*in func*/ + perform pg_sleep(0.1); /*in func*/ +end if; /*in func*/ +end loop; /*in func*/ +return result; /*in func*/ +end; /*in func*/ +$$ language plpgsql; + +-- Given an append only table with partitions that is ready to be compacted +CREATE TABLE insert_while_vacuum_drop_@orientation@ (a int, b int) with (appendonly=true, orientation=@orientation@) + DISTRIBUTED BY (a) + PARTITION BY RANGE (b) +(START (1) END (2) EVERY (1)); + +INSERT INTO insert_while_vacuum_drop_@orientation@ VALUES (1, 1); +DELETE FROM insert_while_vacuum_drop_@orientation@; + +-- And VACUUM drop phase is blocked before it opens the child relation on the primary +SELECT gp_inject_fault('vacuum_relation_open_relation_during_drop_phase', 'suspend', dbid) FROM gp_segment_configuration WHERE content = 1 AND role = 'p'; +1&: VACUUM insert_while_vacuum_drop_@orientation@; +SELECT gp_wait_until_triggered_fault('vacuum_relation_open_relation_during_drop_phase', 1, dbid) FROM gp_segment_configuration WHERE content = 1 AND role = 'p'; + +-- And INSERT is blocked until it acquires the RowExclusiveLock on the child relation +2&: INSERT INTO insert_while_vacuum_drop_@orientation@ VALUES (1, 1); +SELECT wait_until_acquired_lock_on_rel('insert_while_vacuum_drop_@orientation@_1_prt_1', 'RowExclusiveLock', content) FROM gp_segment_configuration WHERE content = 1 AND role = 'p'; + +-- Reset the fault on VACUUM and the two sessions should not be blocking each other +SELECT gp_inject_fault('vacuum_relation_open_relation_during_drop_phase', 'reset', dbid) FROM gp_segment_configuration WHERE content = 1 AND role = 'p'; +1<: +2<: + +-- The following INSERT transaction should still work and should not fail on +-- "ERROR cannot insert into segno (1) for AO relid that is in state AOSEG_STATE_AWAITING_DROP" +INSERT INTO insert_while_vacuum_drop_@orientation@ VALUES (1, 1); \ No newline at end of file diff --git a/src/test/isolation2/output/uao/insert_while_vacuum_drop.source b/src/test/isolation2/output/uao/insert_while_vacuum_drop.source new file mode 100644 index 0000000000..5a82189f6e --- /dev/null +++ b/src/test/isolation2/output/uao/insert_while_vacuum_drop.source @@ -0,0 +1,61 @@ +-- @Description Ensures that an INSERT while VACUUM drop phase does not leave +-- the segfile state inconsistent on master and the primary. +-- +CREATE EXTENSION IF NOT EXISTS gp_inject_fault; +CREATE +select gp_inject_fault('all', 'reset', 1); + gp_inject_fault +----------------- + t +(1 row) + +-- Helper function +CREATE or REPLACE FUNCTION wait_until_acquired_lock_on_rel (rel_name text, lmode text, segment_id integer) RETURNS /*in func*/ bool AS $$ /*in func*/ declare /*in func*/ result bool; /*in func*/ begin /*in func*/ result := false; /*in func*/ -- Wait until lock is acquired /*in func*/ +while result = false loop /*in func*/ select l.granted INTO result /*in func*/ from pg_locks l, pg_class c /*in func*/ where l.relation = c.oid /*in func*/ and c.relname=rel_name /*in func*/ and l.mode=lmode /*in func*/ and l.gp_segment_id=segment_id; /*in func*/ if result = false then /*in func*/ perform pg_sleep(0.1); /*in func*/ end if; /*in func*/ end loop; /*in func*/ return result; /*in func*/ end; /*in func*/ $$ language plpgsql; +CREATE + +-- Given an append only table with partitions that is ready to be compacted +CREATE TABLE insert_while_vacuum_drop_@orientation@ (a int, b int) with (appendonly=true, orientation=@orientation@) DISTRIBUTED BY (a) PARTITION BY RANGE (b) (START (1) END (2) EVERY (1)); +CREATE + +INSERT INTO insert_while_vacuum_drop_@orientation@ VALUES (1, 1); +INSERT 1 +DELETE FROM insert_while_vacuum_drop_@orientation@; +DELETE 1 + +-- And VACUUM drop phase is blocked before it opens the child relation on the primary +SELECT gp_inject_fault('vacuum_relation_open_relation_during_drop_phase', 'suspend', dbid) FROM gp_segment_configuration WHERE content = 1 AND role = 'p'; + gp_inject_fault +----------------- + t +(1 row) +1&: VACUUM insert_while_vacuum_drop_@orientation@; +SELECT gp_wait_until_triggered_fault('vacuum_relation_open_relation_during_drop_phase', 1, dbid) FROM gp_segment_configuration WHERE content = 1 AND role = 'p'; + gp_wait_until_triggered_fault +------------------------------- + t +(1 row) + +-- And INSERT is blocked until it acquires the RowExclusiveLock on the child relation +2&: INSERT INTO insert_while_vacuum_drop_@orientation@ VALUES (1, 1); +SELECT wait_until_acquired_lock_on_rel('insert_while_vacuum_drop_@orientation@_1_prt_1', 'RowExclusiveLock', content) FROM gp_segment_configuration WHERE content = 1 AND role = 'p'; + wait_until_acquired_lock_on_rel +--------------------------------- + +(1 row) + +-- Reset the fault on VACUUM and the two sessions should not be blocking each other +SELECT gp_inject_fault('vacuum_relation_open_relation_during_drop_phase', 'reset', dbid) FROM gp_segment_configuration WHERE content = 1 AND role = 'p'; + gp_inject_fault +----------------- + t +(1 row) +1<: <... completed> +VACUUM +2<: <... completed> +INSERT 1 + +-- The following INSERT transaction should still work and should not fail on +-- "ERROR cannot insert into segno (1) for AO relid that is in state AOSEG_STATE_AWAITING_DROP" +INSERT INTO insert_while_vacuum_drop_@orientation@ VALUES (1, 1); +INSERT 1 diff --git a/src/test/regress/expected/partition_locking.out b/src/test/regress/expected/partition_locking.out index c08b960ae4..5eeeba8480 100644 --- a/src/test/regress/expected/partition_locking.out +++ b/src/test/regress/expected/partition_locking.out @@ -174,11 +174,14 @@ select * from locktest_master where coalesce not like 'gp_%' and coalesce not li partlockt | RowExclusiveLock | relation | master partlockt_1_prt_1 | AccessExclusiveLock | append-only segment file | master partlockt_1_prt_1 | AccessShareLock | relation | master + partlockt_1_prt_1 | RowExclusiveLock | relation | master partlockt_1_prt_2 | AccessExclusiveLock | append-only segment file | master partlockt_1_prt_2 | AccessShareLock | relation | master + partlockt_1_prt_2 | RowExclusiveLock | relation | master partlockt_1_prt_3 | AccessExclusiveLock | append-only segment file | master partlockt_1_prt_3 | AccessShareLock | relation | master -(7 rows) + partlockt_1_prt_3 | RowExclusiveLock | relation | master +(10 rows) select * from locktest_segments where coalesce not like 'gp_%' and coalesce not like 'pg_%'; coalesce | mode | locktype | node @@ -314,10 +317,19 @@ begin; -- insert locking insert into partlockt values(3, 'f'); select * from locktest_master where coalesce not like 'gp_%' and coalesce not like 'pg_%'; - coalesce | mode | locktype | node ------------+------------------+----------+-------- - partlockt | RowExclusiveLock | relation | master -(1 row) + coalesce | mode | locktype | node +-------------------+------------------+----------+-------- + partlockt | RowExclusiveLock | relation | master + partlockt_1_prt_1 | RowExclusiveLock | relation | master + partlockt_1_prt_2 | RowExclusiveLock | relation | master + partlockt_1_prt_3 | RowExclusiveLock | relation | master + partlockt_1_prt_4 | RowExclusiveLock | relation | master + partlockt_1_prt_5 | RowExclusiveLock | relation | master + partlockt_1_prt_6 | RowExclusiveLock | relation | master + partlockt_1_prt_7 | RowExclusiveLock | relation | master + partlockt_1_prt_8 | RowExclusiveLock | relation | master + partlockt_1_prt_9 | RowExclusiveLock | relation | master +(10 rows) select * from locktest_segments where coalesce not like 'gp_%' and coalesce not like 'pg_%'; coalesce | mode | locktype | node diff --git a/src/test/regress/expected/partition_locking_optimizer.out b/src/test/regress/expected/partition_locking_optimizer.out index cab015e6e8..54fa995a39 100644 --- a/src/test/regress/expected/partition_locking_optimizer.out +++ b/src/test/regress/expected/partition_locking_optimizer.out @@ -174,11 +174,14 @@ select * from locktest_master where coalesce not like 'gp_%' and coalesce not li partlockt | RowExclusiveLock | relation | master partlockt_1_prt_1 | AccessExclusiveLock | append-only segment file | master partlockt_1_prt_1 | AccessShareLock | relation | master + partlockt_1_prt_1 | RowExclusiveLock | relation | master partlockt_1_prt_2 | AccessExclusiveLock | append-only segment file | master partlockt_1_prt_2 | AccessShareLock | relation | master + partlockt_1_prt_2 | RowExclusiveLock | relation | master partlockt_1_prt_3 | AccessExclusiveLock | append-only segment file | master partlockt_1_prt_3 | AccessShareLock | relation | master -(7 rows) + partlockt_1_prt_3 | RowExclusiveLock | relation | master +(10 rows) select * from locktest_segments where coalesce not like 'gp_%' and coalesce not like 'pg_%'; coalesce | mode | locktype | node @@ -312,10 +315,19 @@ begin; -- insert locking insert into partlockt values(3, 'f'); select * from locktest_master where coalesce not like 'gp_%' and coalesce not like 'pg_%'; - coalesce | mode | locktype | node ------------+------------------+----------+-------- - partlockt | RowExclusiveLock | relation | master -(1 row) + coalesce | mode | locktype | node +-------------------+------------------+----------+-------- + partlockt | RowExclusiveLock | relation | master + partlockt_1_prt_1 | RowExclusiveLock | relation | master + partlockt_1_prt_2 | RowExclusiveLock | relation | master + partlockt_1_prt_3 | RowExclusiveLock | relation | master + partlockt_1_prt_4 | RowExclusiveLock | relation | master + partlockt_1_prt_5 | RowExclusiveLock | relation | master + partlockt_1_prt_6 | RowExclusiveLock | relation | master + partlockt_1_prt_7 | RowExclusiveLock | relation | master + partlockt_1_prt_8 | RowExclusiveLock | relation | master + partlockt_1_prt_9 | RowExclusiveLock | relation | master +(10 rows) select * from locktest_segments where coalesce not like 'gp_%' and coalesce not like 'pg_%'; coalesce | mode | locktype | node -- GitLab