From af3bbc3e5380600a2e83953748a037ab8f615e26 Mon Sep 17 00:00:00 2001
From: Paul Guo <paulguo@gmail.com>
Date: Tue, 3 Dec 2019 15:24:11 +0800
Subject: [PATCH] Do not terminate the connection and quit in
 SyncRepWaitForLSN().

We previously had the code to terminate the connection if needed on QE to avoid
potential data inconsistency. This is gpdb specific since upstream code there
seems to be not friendly for failover + data consistency. However that
introduces various abort or assert failures since apparently some shm exit
callback functions are not friendly to the current transaction state. Below are
some stack examples.

Originally I fixed them in those callback functions but I found on gpdb6 after
I fixed one, another one (in another callback function) come out.  That's why I
could collect so many gpdb 6 stacks below. I just collect one gpdb master
stack, but I it should have more stacks also if we fixing in those callbacks
one by one. Anyway finally I decide to fix by delaying the ereport(FATAL)
exec_mpp_dtx_protocol_command() instead, and let QD retry 2PC to ensure the
data consistency. Note 1PC retry is currently not implemented but this should
be in another PR.

gpdb master (7) stack:

2  0x0000000000b48ddc in ExceptionalCondition (conditionName=0xe527e8 "!(ShmemAddrIsValid(nextElem))", errorType=0xe527bd "FailedAssertion",
    fileName=0xe527b2 "shmqueue.c", lineNumber=74) at assert.c:66
3  0x0000000000996311 in SHMQueueDelete (queue=0x7ff5e6676da8) at shmqueue.c:74
4  0x00000000009689de in SyncRepCleanupAtProcExit () at syncrep.c:436
5  0x00000000009a7b49 in ProcKill (code=1, arg=0) at proc.c:949
6  0x000000000098c001 in shmem_exit (code=1) at ipc.c:288
7  0x000000000098be5f in proc_exit_prepare (code=1) at ipc.c:212
8  0x000000000098bd64 in proc_exit (code=1) at ipc.c:104
9  0x0000000000b4a7d4 in errfinish (dummy=0) at elog.c:738
10 0x000000000096860e in SyncRepWaitForLSN (lsn=210148624, commit=1 '\001') at syncrep.c:303
11 0x000000000055c082 in RecordTransactionCommitPrepared (xid=638, gid=0x2c603ad "1575462785-0000000012", nchildren=0, children=0x2c6d2d0, nrels=0, rels=0x2c6d2d0,
	    ndeldbs=0, deldbs=0x2c6d2d0, ninvalmsgs=0, invalmsgs=0x2c6d2d0, initfileinval=0 '\000') at twophase.c:2283
12 0x000000000055aae3 in FinishPreparedTransaction (gid=0x2c603ad "1575462785-0000000012", isCommit=1 '\001', raiseErrorIfNotFound=0 '\000') at twophase.c:1493
13 0x0000000000c4e4fe in performDtxProtocolCommitPrepared (gid=0x2c603ad "1575462785-0000000012", raiseErrorIfNotFound=0 '\000') at cdbtm.c:2037
14 0x0000000000c4e9d5 in performDtxProtocolCommand (dtxProtocolCommand=DTX_PROTOCOL_COMMAND_RECOVERY_COMMIT_PREPARED, gid=0x2c603ad "1575462785-0000000012",
		    contextInfo=0x1220f20) at cdbtm.c:2215

gpdb 6 stacks:

2  0x0000000000ad9ea5 in ExceptionalCondition (conditionName=0xdbfddb "!(MyProc->syncRepState == 0)", errorType=0xdbfd28 "FailedAssertion",
   fileName=0xdbfcd0 "syncrep.c", lineNumber=130) at assert.c:66
3  0x000000000091ce81 in SyncRepWaitForLSN (XactCommitLSN=3400317528) at syncrep.c:130
4  0x000000000053991a in RecordTransactionCommit () at xact.c:1663
5  0x000000000053b0b2 in CommitTransaction () at xact.c:2756
6  0x000000000053c024 in CommitTransactionCommand () at xact.c:3646
7  0x00000000005c6c25 in RemoveTempRelationsCallback (code=1, arg=0) at namespace.c:4107
8  0x000000000093c353 in shmem_exit (code=1) at ipc.c:257
9  0x000000000093c248 in proc_exit_prepare (code=1) at ipc.c:214
10 0x000000000093c146 in proc_exit (code=1) at ipc.c:104
11 0x0000000000adb93d in errfinish (dummy=0) at elog.c:754
12 0x000000000091d2ef in SyncRepWaitForLSN (XactCommitLSN=3400294096) at syncrep.c:284
13 0x0000000000549d8e in EndPrepare (gxact=0x7f8a7d5fa0e0) at twophase.c:1241

3  0x0000000000ade6d1 in elog_finish (elevel=22, fmt=0xc3a898 "cannot abort transaction %u, it was already committed") at elog.c:1735
4  0x0000000000539d22 in RecordTransactionAbort (isSubXact=0 '\000') at xact.c:1923
5  0x000000000053b95c in AbortTransaction () at xact.c:3340
6  0x000000000053e0a7 in AbortOutOfAnyTransaction () at xact.c:5248
7  0x00000000005c68b9 in RemoveTempRelationsCallback (code=1, arg=0) at namespace.c:4088
8  0x000000000093c371 in shmem_exit (code=1) at ipc.c:257
9  0x000000000093c266 in proc_exit_prepare (code=1) at ipc.c:214
10 0x000000000093c164 in proc_exit (code=1) at ipc.c:104
11 0x0000000000adb94e in errfinish (dummy=0) at elog.c:754
12 0x000000000091d30d in SyncRepWaitForLSN (XactCommitLSN=19529538376) at syncrep.c:284
13 0x000000000053985a in RecordTransactionCommit () at xact.c:1663

2  0x0000000000adb9a9 in ExceptionalCondition (conditionName=0xdb2560 "!(entry->trans == ((void *)0))", errorType=0xdb2550 "FailedAssertion",
    fileName=0xdb216a "pgstat.c", lineNumber=842) at assert.c:66
3  0x00000000008d3391 in pgstat_report_stat (force=1 '\001') at pgstat.c:842
4  0x00000000008d65e8 in pgstat_beshutdown_hook (code=1, arg=0) at pgstat.c:2685
5  0x000000000093deba in shmem_exit (code=1) at ipc.c:290
6  0x000000000093dd1a in proc_exit_prepare (code=1) at ipc.c:214
7  0x000000000093dc18 in proc_exit (code=1) at ipc.c:104
8  0x0000000000add441 in errfinish (dummy=0) at elog.c:750
9  0x000000000091ee5c in SyncRepWaitForLSN (XactCommitLSN=225227432) at syncrep.c:333
10 0x0000000000549dd8 in EndPrepare (gxact=0x7f02508680e0) at twophase.c:1241

2  0x0000000000adb9c2 in ExceptionalCondition (conditionName=0xdcb458 "!(!((allPgXact[proc->pgprocno].xid) != ((TransactionId) 0)))",
    errorType=0xdcb408 "FailedAssertion", fileName=0xdcb3d9 "procarray.c", lineNumber=369) at assert.c:66
3  0x000000000093f614 in ProcArrayRemove (proc=0x7f4f1f5a05d0, latestXid=0) at procarray.c:369
4  0x00000000009586ec in RemoveProcFromArray (code=1, arg=0) at proc.c:904
5  0x000000000093ded3 in shmem_exit (code=1) at ipc.c:290
6  0x000000000093dd33 in proc_exit_prepare (code=1) at ipc.c:214
7  0x000000000093dc31 in proc_exit (code=1) at ipc.c:104
8  0x0000000000add45a in errfinish (dummy=0) at elog.c:750
9  0x000000000091ee75 in SyncRepWaitForLSN (XactCommitLSN=348629504) at syncrep.c:333
10 0x0000000000549dd8 in EndPrepare (gxact=0x7f4f1fa8cce0) at twophase.c:1241
11 0x000000000053b621 in PrepareTransaction () at xact.c:3115

Reviewed-by: Ashwin Agrawal <aagrawal@pivotal.io>
Reviewed-by: Asim R P <apraveen@pivotal.io>

Cherry-picked from 7b76173061871c4d6bbe974584917295cadf75df
---
 src/backend/replication/syncrep.c             |  16 ++-
 src/backend/tcop/postgres.c                   |  10 ++
 .../die_commit_pending_replication.out        | 107 ++++++++++++++++++
 src/test/isolation2/isolation2_schedule       |   2 +
 .../die_commit_pending_replication.sql        |  52 +++++++++
 5 files changed, 182 insertions(+), 5 deletions(-)
 create mode 100644 src/test/isolation2/expected/segwalrep/die_commit_pending_replication.out
 create mode 100644 src/test/isolation2/sql/segwalrep/die_commit_pending_replication.sql

diff --git a/src/backend/replication/syncrep.c b/src/backend/replication/syncrep.c
index 1aff542ec7..8276d27d9c 100644
--- a/src/backend/replication/syncrep.c
+++ b/src/backend/replication/syncrep.c
@@ -263,6 +263,8 @@ SyncRepWaitForLSN(XLogRecPtr XactCommitLSN, bool commit)
 			break;
 		}
 
+		SIMPLE_FAULT_INJECTOR("sync_rep_query_die");
+
 		/*
 		 * If a wait for synchronous replication is pending, we can neither
 		 * acknowledge the commit nor raise ERROR or FATAL.  The latter would
@@ -278,13 +280,14 @@ SyncRepWaitForLSN(XLogRecPtr XactCommitLSN, bool commit)
 		if (ProcDiePending)
 		{
 			/*
-			 * FATAL only for QE's which use 2PC and hence can handle the
-			 * FATAL and retry.
+			 * For QE we should have done FATAL here so that 2PC can retry, but
+			 * FATAL here makes some shm exit callback functions panic or
+			 * assert fail because the transaction is still not finished, so
+			 * let's defer the quitting to exec_mpp_dtx_protocol_command().
 			 */
-			ereport(IS_QUERY_DISPATCHER() ? WARNING:FATAL,
+			ereport(WARNING,
 					(errcode(ERRCODE_ADMIN_SHUTDOWN),
-					 errmsg("canceling the wait for synchronous replication and terminating connection due to administrator command"),
-					 errdetail("The transaction has already committed locally, but might not have been replicated to the standby.")));
+					 errmsg("canceling the wait for synchronous replication and terminating connection due to administrator command")));
 			whereToSendOutput = DestNone;
 			SyncRepCancelWait();
 			break;
@@ -320,6 +323,9 @@ SyncRepWaitForLSN(XLogRecPtr XactCommitLSN, bool commit)
 		if (!PostmasterIsAlive())
 		{
 			ProcDiePending = true;
+			ereport(WARNING,
+				(errcode(ERRCODE_ADMIN_SHUTDOWN),
+				 errmsg("canceling the wait for synchronous replication and terminating connection due to postmaster death")));
 			whereToSendOutput = DestNone;
 			SyncRepCancelWait();
 			break;
diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c
index 888c31e259..a54ac7e3a3 100644
--- a/src/backend/tcop/postgres.c
+++ b/src/backend/tcop/postgres.c
@@ -1523,6 +1523,16 @@ exec_mpp_dtx_protocol_command(DtxProtocolCommand dtxProtocolCommand,
 						Debug_dtm_action, DtxProtocolCommandToString(dtxProtocolCommand))));
 	}
 
+	/*
+	 * GPDB: There is a corner case that we need to delay connection
+	 * termination to here. see SyncRepWaitForLSN() for details.
+	 * */
+	if (ProcDiePending)
+		ereport(FATAL,
+				(errcode(ERRCODE_ADMIN_SHUTDOWN),
+				errmsg("Terminating the connection (DTM protocol command '%s' "
+					   "for gid=%s", loggingStr, gid)));
+
 	EndCommand(commandTag, dest);
 }
 
diff --git a/src/test/isolation2/expected/segwalrep/die_commit_pending_replication.out b/src/test/isolation2/expected/segwalrep/die_commit_pending_replication.out
new file mode 100644
index 0000000000..28d0e53b4a
--- /dev/null
+++ b/src/test/isolation2/expected/segwalrep/die_commit_pending_replication.out
@@ -0,0 +1,107 @@
+-- Adding `2` as first column as the distribution column.
+-- `2` should be on the first segment. let's double check here.
+create table store_session_id(a int, sess_id int);
+CREATE
+1: insert into store_session_id select 2, sess_id from pg_stat_activity where pid = pg_backend_pid();
+INSERT 1
+1: select gp_segment_id, a from store_session_id;
+ gp_segment_id | a 
+---------------+---
+ 0             | 2 
+(1 row)
+
+1: create table die_commit_pending_replication(a int, b int);
+CREATE
+
+-- Suspend to hit commit-prepared point on segment (as we are
+-- interested in testing Commit here and not really Prepare)
+select gp_inject_fault_infinite('finish_prepared_start_of_function', 'suspend', dbid) from gp_segment_configuration where role='p' and content = 0;
+ gp_inject_fault_infinite 
+--------------------------
+ Success:                 
+(1 row)
+1&: insert into die_commit_pending_replication values(2),(1);  <waiting ...>
+select gp_wait_until_triggered_fault('finish_prepared_start_of_function', 1, dbid) from gp_segment_configuration where role='p' and content = 0;
+ gp_wait_until_triggered_fault 
+-------------------------------
+ Success:                      
+(1 row)
+
+-- Now pause the wal sender on primary for content 0
+select gp_inject_fault_infinite('wal_sender_loop', 'suspend', dbid) from gp_segment_configuration where role='p' and content = 0;
+ gp_inject_fault_infinite 
+--------------------------
+ Success:                 
+(1 row)
+
+-- Insert fault in the ProcDiePending code block.
+select gp_inject_fault_infinite('sync_rep_query_die', 'skip', dbid) from gp_segment_configuration where role='p' and content = 0;
+ gp_inject_fault_infinite 
+--------------------------
+ Success:                 
+(1 row)
+
+-- Let the transaction move forward with the commit
+select gp_inject_fault('finish_prepared_start_of_function', 'reset', dbid) from gp_segment_configuration where role='p' and content = 0;
+ gp_inject_fault 
+-----------------
+ Success:        
+(1 row)
+
+-- Ensure the ProcDiePending code block is called. It implies 'replication' in pg_stat_activity.
+select gp_wait_until_triggered_fault('sync_rep_query_die', 1, dbid) from gp_segment_configuration where role='p' and content = 0;
+ gp_wait_until_triggered_fault 
+-------------------------------
+ Success:                      
+(1 row)
+
+-- We can terminate the backend on QE now.
+0U: select pg_terminate_backend(pid) from pg_stat_activity where waiting_reason='replication' and sess_id in (select sess_id from store_session_id);
+ pg_terminate_backend 
+----------------------
+ t                    
+(1 row)
+
+-- We expect two more occurrence: one for backend quitting and another for retry.
+select gp_wait_until_triggered_fault('sync_rep_query_die', 3, dbid) from gp_segment_configuration where role='p' and content = 0;
+ gp_wait_until_triggered_fault 
+-------------------------------
+ Success:                      
+(1 row)
+
+-- Verify that the sess_id changes due to retry.
+0U: select pid,sess_id,waiting_reason,query from pg_stat_activity where sess_id in (select sess_id from store_session_id);
+ pid | sess_id | waiting_reason | query 
+-----+---------+----------------+-------
+(0 rows)
+
+-- resume the primary wal replication so that retry could complete.
+select gp_inject_fault('wal_sender_loop', 'reset', dbid) from gp_segment_configuration where role='p' and content = 0;
+ gp_inject_fault 
+-----------------
+ Success:        
+(1 row)
+
+-- reset sync_rep_query_die
+select gp_inject_fault('sync_rep_query_die', 'reset', dbid) from gp_segment_configuration where role='p' and content = 0;
+ gp_inject_fault 
+-----------------
+ Success:        
+(1 row)
+
+1<:  <... completed>
+INSERT 2
+
+-- check if the insert fails or not.
+select gp_segment_id, * from die_commit_pending_replication;
+ gp_segment_id | a | b 
+---------------+---+---
+ 0             | 2 |   
+ 1             | 1 |   
+(2 rows)
+
+-- cleanup
+drop table die_commit_pending_replication;
+DROP
+drop table store_session_id;
+DROP
diff --git a/src/test/isolation2/isolation2_schedule b/src/test/isolation2/isolation2_schedule
index c68236ca23..05224adf1c 100644
--- a/src/test/isolation2/isolation2_schedule
+++ b/src/test/isolation2/isolation2_schedule
@@ -179,6 +179,8 @@ test: uao/insert_should_not_use_awaiting_drop_column
 test: add_column_after_vacuum_skip_drop_column
 test: vacuum_after_vacuum_skip_drop_column
 
+test: segwalrep/die_commit_pending_replication
+
 # Tests for FTS
 test: fts_errors
 test: segwalrep/commit_blocking
diff --git a/src/test/isolation2/sql/segwalrep/die_commit_pending_replication.sql b/src/test/isolation2/sql/segwalrep/die_commit_pending_replication.sql
new file mode 100644
index 0000000000..f34bde11b8
--- /dev/null
+++ b/src/test/isolation2/sql/segwalrep/die_commit_pending_replication.sql
@@ -0,0 +1,52 @@
+-- Adding `2` as first column as the distribution column.
+-- `2` should be on the first segment. let's double check here.
+create table store_session_id(a int, sess_id int);
+1: insert into store_session_id select 2, sess_id from pg_stat_activity where pid = pg_backend_pid();
+1: select gp_segment_id, a from store_session_id;
+
+1: create table die_commit_pending_replication(a int, b int);
+
+-- Suspend to hit commit-prepared point on segment (as we are
+-- interested in testing Commit here and not really Prepare)
+select gp_inject_fault_infinite('finish_prepared_start_of_function', 'suspend', dbid) from gp_segment_configuration where role='p' and content = 0;
+1&: insert into die_commit_pending_replication values(2),(1);
+select gp_wait_until_triggered_fault('finish_prepared_start_of_function', 1, dbid) from gp_segment_configuration where role='p' and content = 0;
+
+-- Now pause the wal sender on primary for content 0
+select gp_inject_fault_infinite('wal_sender_loop', 'suspend', dbid) from gp_segment_configuration where role='p' and content = 0;
+
+-- Insert fault in the ProcDiePending code block.
+select gp_inject_fault_infinite('sync_rep_query_die', 'skip', dbid) from gp_segment_configuration where role='p' and content = 0;
+
+-- Let the transaction move forward with the commit
+select gp_inject_fault('finish_prepared_start_of_function', 'reset', dbid) from gp_segment_configuration where role='p' and content = 0;
+
+-- Ensure the ProcDiePending code block is called. It implies 'replication' in pg_stat_activity.
+select gp_wait_until_triggered_fault('sync_rep_query_die', 1, dbid) from gp_segment_configuration where role='p' and content = 0;
+
+-- We can terminate the backend on QE now.
+0U: select pg_terminate_backend(pid) from pg_stat_activity
+      where waiting_reason='replication' and
+      sess_id in (select sess_id from store_session_id);
+
+-- We expect two more occurrence: one for backend quitting and another for retry.
+select gp_wait_until_triggered_fault('sync_rep_query_die', 3, dbid) from gp_segment_configuration where role='p' and content = 0;
+
+-- Verify that the sess_id changes due to retry.
+0U: select pid,sess_id,waiting_reason,query from pg_stat_activity
+      where sess_id in (select sess_id from store_session_id);
+
+-- resume the primary wal replication so that retry could complete.
+select gp_inject_fault('wal_sender_loop', 'reset', dbid) from gp_segment_configuration where role='p' and content = 0;
+
+-- reset sync_rep_query_die
+select gp_inject_fault('sync_rep_query_die', 'reset', dbid) from gp_segment_configuration where role='p' and content = 0;
+
+1<:
+
+-- check if the insert fails or not.
+select gp_segment_id, * from die_commit_pending_replication;
+
+-- cleanup
+drop table die_commit_pending_replication;
+drop table store_session_id;
-- 
GitLab