Introduce GUC to control fts mirror down grace period.

fdfbefa7 · Ashwin Agrawal · 4c39375c · fdfbefa7 · fdfbefa7 · fdfbefa7
5 changed files
--- a/src/backend/cdb/cdbvars.c
+++ b/src/backend/cdb/cdbvars.c
@@ -134,17 +134,11 @@ int			gp_fts_probe_timeout = 20;
 int			gp_fts_probe_interval = 60;

 /*
- * Number of threads to use for probe of segments (it is a good idea to have this
- * larger than the number of segments per host.
+ * Grace period, in seconds. If a mirror disconnects and re-connects within
+ * this period, or takes up to this long to connect initially at cluster
+ * start, it will not get reported as down to FTS.
 */
-int			gp_fts_probe_threadcount = 16;
-
-/* The number of retries to request a segment state transition. */
-int			gp_fts_transition_retries = 5;
-
-/* Timeout to request a segment state transition. */
-int			gp_fts_transition_timeout = 3600;
-
+int gp_fts_mark_mirror_down_grace_period = 30;

 /*
 * When we have certain types of failures during gang creation which indicate

--- a/src/backend/replication/gp_replication.c
+++ b/src/backend/replication/gp_replication.c
@@ -13,6 +13,7 @@
 #include "postgres.h"

 #include "pgtime.h"
+#include "cdb/cdbvars.h"
 #include "replication/gp_replication.h"
 #include "replication/walreceiver.h"
 #include "replication/walsender_private.h"
@@ -21,13 +22,6 @@
 /* Set at database system is ready to accept connections */
 extern pg_time_t PMAcceptingConnectionsStartTime;

-/*
- * If mirror disconnects and re-connects between this period, or just takes
- * this much time during initial connection of cluster start, it will not get
- * reported as down to FTS.
- */
-#define FTS_MARKING_MIRROR_DOWN_GRACE_PERIOD 30 /* secs */
-
 /*
 * Check the WalSndCtl to obtain if mirror is up or down, if the wal sender is
 * in streaming, and if synchronous replication is enabled or not.
@@ -71,11 +65,11 @@ GetMirrorStatus(FtsResponse *response)
 			 * glitch. During this period, request FTS to probe again.
 			 *
 			 * If the delta is negative, then it's overflowed, meaning it's
-			 * over FTS_MARKING_MIRROR_DOWN_GRACE_PERIOD since either last
+			 * over gp_fts_mark_mirror_down_grace_period since either last
 			 * database accepting connections or last time wal sender
 			 * died. Then, we can safely mark the mirror is down.
 			 */
-			if (delta < FTS_MARKING_MIRROR_DOWN_GRACE_PERIOD && delta >= 0)
+			if (delta < gp_fts_mark_mirror_down_grace_period && delta >= 0)
 			{
 				ereport(LOG,
 						(errmsg("requesting fts retry as mirror didn't connect yet but in grace period: " INT64_FORMAT, delta),

--- a/src/backend/replication/test/gp_replication_test.c
+++ b/src/backend/replication/test/gp_replication_test.c
@@ -79,7 +79,7 @@ test_GetMirrorStatus_Pid_Zero(void **state)
 	 * duration is taken into account.
 	 */
 	data.walsnds[0].marked_pid_zero_at_time =
-		((pg_time_t) time(NULL)) - FTS_MARKING_MIRROR_DOWN_GRACE_PERIOD;
+		((pg_time_t) time(NULL)) - gp_fts_mark_mirror_down_grace_period;

 	/*
 	 * Ensure the recovery finished before wal sender died.
@@ -107,12 +107,12 @@ test_GetMirrorStatus_RequestRetry(void **state)
 	 * Make the pid zero time within the grace period.
 	 */
 	data.walsnds[0].marked_pid_zero_at_time =
-		((pg_time_t) time(NULL)) - FTS_MARKING_MIRROR_DOWN_GRACE_PERIOD/2;
+		((pg_time_t) time(NULL)) - gp_fts_mark_mirror_down_grace_period/2;

 	/*
 	 * Ensure recovery finished before wal sender died.
 	 */
-	PMAcceptingConnectionsStartTime = data.walsnds[0].marked_pid_zero_at_time - FTS_MARKING_MIRROR_DOWN_GRACE_PERIOD;
+	PMAcceptingConnectionsStartTime = data.walsnds[0].marked_pid_zero_at_time - gp_fts_mark_mirror_down_grace_period;

 	expect_lwlock(LW_SHARED);
 	expect_ereport();
@@ -138,14 +138,14 @@ test_GetMirrorStatus_Delayed_AcceptingConnectionsStartTime(void **state)
 	 * Mirror will be marked down, and no retry.
 	 */
 	data.walsnds[0].marked_pid_zero_at_time =
-		((pg_time_t) time(NULL)) - FTS_MARKING_MIRROR_DOWN_GRACE_PERIOD;
+		((pg_time_t) time(NULL)) - gp_fts_mark_mirror_down_grace_period;

 	/*
 	 * However the database was in recovery, hence
 	 * we are still within the grace period, and
 	 * we should retry.
 	 */
-	PMAcceptingConnectionsStartTime = ((pg_time_t) time(NULL)) - FTS_MARKING_MIRROR_DOWN_GRACE_PERIOD/2;
+	PMAcceptingConnectionsStartTime = ((pg_time_t) time(NULL)) - gp_fts_mark_mirror_down_grace_period/2;

 	expect_lwlock(LW_SHARED);
 	expect_ereport();

--- a/src/backend/utils/misc/guc_gp.c
+++ b/src/backend/utils/misc/guc_gp.c
@@ -3502,6 +3502,16 @@ struct config_int ConfigureNamesInt_gp[] =
 		60, 10, 3600, NULL, NULL
 	},

+	{
+		{"gp_fts_mark_mirror_down_grace_period", PGC_SIGHUP, GP_ARRAY_TUNING,
+			gettext_noop("Time (in seconds) allowed to mirror after disconnection, to reconnect before being marked as down in configuration by FTS."),
+			gettext_noop("Used by the fts-probe process."),
+			GUC_UNIT_S
+		},
+		&gp_fts_mark_mirror_down_grace_period,
+		30, 0, 3600, NULL, NULL
+	},
+
 	{
 		{"gp_gang_creation_retry_count", PGC_USERSET, GP_ARRAY_TUNING,
 			gettext_noop("After a gang-creation fails, retry the number of times if failure is retryable."),

--- a/src/include/cdb/cdbvars.h
+++ b/src/include/cdb/cdbvars.h
@@ -346,7 +346,7 @@ extern int	gp_snapshotadd_timeout; /* GUC var - timeout specifier for snapshot-c
 extern int	gp_fts_probe_retries; /* GUC var - specifies probe number of retries for FTS */
 extern int	gp_fts_probe_timeout; /* GUC var - specifies probe timeout for FTS */
 extern int	gp_fts_probe_interval; /* GUC var - specifies polling interval for FTS */
-extern int	gp_fts_probe_threadcount; /* GUC var - specifies number of threads to use for FTS probes */
+extern int gp_fts_mark_mirror_down_grace_period; /* GUC var - grace period (in seconds) a disconnected mirror gets to reconnect before being reported as down to FTS */

 extern int gp_gang_creation_retry_count; /* How many retries ? */
 extern int gp_gang_creation_retry_timer; /* How long between retries */