From 6b323078695d1d1ab81a4b640990a666ea91f252 Mon Sep 17 00:00:00 2001 From: Kenan Yao Date: Fri, 12 Aug 2016 18:00:57 +0800 Subject: [PATCH] Fix the postmaster reset failure on master node If a QD crashes for reasons such as SIGSEGV, SIGKILL or PANIC, postmaster reset sometimes fails. The root cause is: postmaster would first tell child processes to exit, and then wait for the termination of important processes such as AutoVacuum, BgWriter, CheckPoint, etc., before it resets shared memory and restarts auxiliary processes; however, the WAL writer process is missing from the waiting list, so it can happen that postmaster spawns StartupProcess and then notices the exit of the WAL writer, so it tells StartupProcess to exit; then postmaster would notice the abnormal exit of StartupProcess in turn, treat it as a recovery failure, and then call exit() itself. Thus, we end up with no postmaster process on the master node at all. This happens almost every time when the master host machine has poor performance. --- src/backend/postmaster/postmaster.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index 72d5cf3bf8..780cc1b447 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -4209,6 +4209,7 @@ do_immediate_shutdown_reaper(void) zeroIfPidEqual(pid, &BgWriterPID); zeroIfPidEqual(pid, &CheckpointPID); zeroIfPidEqual(pid, &WalReceiverPID); + zeroIfPidEqual(pid, &WalWriterPID); zeroIfPidEqual(pid, &AutoVacPID); zeroIfPidEqual(pid, &PgArchPID); zeroIfPidEqual(pid, &PgStatPID); @@ -5041,6 +5042,7 @@ static void do_reaper() FilerepPeerResetPID != 0 || AutoVacPID != 0 || WalReceiverPID != 0 || + WalWriterPID != 0 || ServiceProcessesExist(0)) { /* important child is still going...wait longer */ @@ -5147,6 +5149,8 @@ GetServerProcessTitle(int pid) return "background writer process"; if (pid == CheckpointPID) return "checkpoint process"; + else if (pid == 
WalWriterPID) + return "walwriter process"; else if (pid == WalReceiverPID) return "walreceiver process"; else if (pid == AutoVacPID) @@ -5642,13 +5646,13 @@ static PMState StateMachineCheck_WaitBackends(void) } else { - // note: if wal writer is added, check this here: WalWriterPID == 0 && int childCount = CountChildren(BACKEND_TYPE_AUTOVAC|BACKEND_TYPE_NORMAL); bool isFilerepBackendsDoneShutdown = IsFilerepBackendsDoneShutdown(); bool autovacShutdown = AutoVacPID == 0; if (childCount == 0 && WalReceiverPID == 0 && + WalWriterPID == 0 && (BgWriterPID == 0 || !FatalError) && /* todo: CHAD_PM why wait for BgWriterPID here? Can't we just allow normal state advancement to hit there? */ autovacShutdown && @@ -5873,6 +5877,9 @@ static void StateMachineTransition_ShutdownBackends(void) /* and the autovac launcher too */ signal_child_if_up(AutoVacPID, SIGTERM); + /* and the wal writer too */ + signal_child_if_up(WalWriterPID, SIGTERM); + signal_filerep_to_shutdown(SegmentStateShutdownFilerepBackends); } -- GitLab