提交 fdfbefa7 编写于 作者: A Ashwin Agrawal

Introduce GUC to control fts mirror down grace period.

上级 4c39375c
......@@ -134,17 +134,11 @@ int gp_fts_probe_timeout = 20;
int gp_fts_probe_interval = 60;
/*
* Number of threads to use for probe of segments (it is a good idea to have this
* larger than the number of segments per host).
* If mirror disconnects and re-connects between this period, or just takes
* this much time during initial connection of cluster start, it will not get
* reported as down to FTS.
*/
int gp_fts_probe_threadcount = 16;
/* The number of retries to request a segment state transition. */
int gp_fts_transition_retries = 5;
/* Timeout to request a segment state transition. */
int gp_fts_transition_timeout = 3600;
/*
 * Grace period, in seconds, during which a mirror that has disconnected (or
 * has not yet connected after startup) is not reported as down to FTS.
 * Exposed as the GUC of the same name; the value here is the initial default.
 */
int gp_fts_mark_mirror_down_grace_period = 30;
/*
* When we have certain types of failures during gang creation which indicate
......
......@@ -13,6 +13,7 @@
#include "postgres.h"
#include "pgtime.h"
#include "cdb/cdbvars.h"
#include "replication/gp_replication.h"
#include "replication/walreceiver.h"
#include "replication/walsender_private.h"
......@@ -21,13 +22,6 @@
/* Set when the database system is ready to accept connections */
extern pg_time_t PMAcceptingConnectionsStartTime;
/*
* If mirror disconnects and re-connects between this period, or just takes
* this much time during initial connection of cluster start, it will not get
* reported as down to FTS.
*/
#define FTS_MARKING_MIRROR_DOWN_GRACE_PERIOD 30 /* secs */
/*
* Check the WalSndCtl to obtain if mirror is up or down, if the wal sender is
* in streaming, and if synchronous replication is enabled or not.
......@@ -71,11 +65,11 @@ GetMirrorStatus(FtsResponse *response)
* glitch. During this period, request FTS to probe again.
*
* If the delta is negative, then it's overflowed, meaning it's
* over FTS_MARKING_MIRROR_DOWN_GRACE_PERIOD since either last
* over gp_fts_mark_mirror_down_grace_period since either last
* database accepting connections or last time wal sender
* died. Then, we can safely mark the mirror as down.
*/
if (delta < FTS_MARKING_MIRROR_DOWN_GRACE_PERIOD && delta >= 0)
if (delta < gp_fts_mark_mirror_down_grace_period && delta >= 0)
{
ereport(LOG,
(errmsg("requesting fts retry as mirror didn't connect yet but in grace period: " INT64_FORMAT, delta),
......
......@@ -79,7 +79,7 @@ test_GetMirrorStatus_Pid_Zero(void **state)
* duration is taken into account.
*/
data.walsnds[0].marked_pid_zero_at_time =
((pg_time_t) time(NULL)) - FTS_MARKING_MIRROR_DOWN_GRACE_PERIOD;
((pg_time_t) time(NULL)) - gp_fts_mark_mirror_down_grace_period;
/*
* Ensure the recovery finished before wal sender died.
......@@ -107,12 +107,12 @@ test_GetMirrorStatus_RequestRetry(void **state)
* Make the pid zero time within the grace period.
*/
data.walsnds[0].marked_pid_zero_at_time =
((pg_time_t) time(NULL)) - FTS_MARKING_MIRROR_DOWN_GRACE_PERIOD/2;
((pg_time_t) time(NULL)) - gp_fts_mark_mirror_down_grace_period/2;
/*
* Ensure recovery finished before wal sender died.
*/
PMAcceptingConnectionsStartTime = data.walsnds[0].marked_pid_zero_at_time - FTS_MARKING_MIRROR_DOWN_GRACE_PERIOD;
PMAcceptingConnectionsStartTime = data.walsnds[0].marked_pid_zero_at_time - gp_fts_mark_mirror_down_grace_period;
expect_lwlock(LW_SHARED);
expect_ereport();
......@@ -138,14 +138,14 @@ test_GetMirrorStatus_Delayed_AcceptingConnectionsStartTime(void **state)
* Mirror will be marked down, and no retry.
*/
data.walsnds[0].marked_pid_zero_at_time =
((pg_time_t) time(NULL)) - FTS_MARKING_MIRROR_DOWN_GRACE_PERIOD;
((pg_time_t) time(NULL)) - gp_fts_mark_mirror_down_grace_period;
/*
* However the database was in recovery, hence
* we are still within the grace period, and
* we should retry.
*/
PMAcceptingConnectionsStartTime = ((pg_time_t) time(NULL)) - FTS_MARKING_MIRROR_DOWN_GRACE_PERIOD/2;
PMAcceptingConnectionsStartTime = ((pg_time_t) time(NULL)) - gp_fts_mark_mirror_down_grace_period/2;
expect_lwlock(LW_SHARED);
expect_ereport();
......
......@@ -3502,6 +3502,16 @@ struct config_int ConfigureNamesInt_gp[] =
60, 10, 3600, NULL, NULL
},
/*
 * GUC table entry for gp_fts_mark_mirror_down_grace_period: an integer
 * setting registered with context PGC_SIGHUP, in group GP_ARRAY_TUNING,
 * expressed in seconds (GUC_UNIT_S), and backed by the global variable of
 * the same name.
 */
{
{"gp_fts_mark_mirror_down_grace_period", PGC_SIGHUP, GP_ARRAY_TUNING,
gettext_noop("Time (in seconds) allowed to mirror after disconnection, to reconnect before being marked as down in configuration by FTS."),
gettext_noop("Used by the fts-probe process."),
GUC_UNIT_S
},
&gp_fts_mark_mirror_down_grace_period,
/*
 * NOTE(review): presumably boot value 30 with an allowed range of 0..3600,
 * followed by two unset hook pointers -- confirm the field order against
 * struct config_int.
 */
30, 0, 3600, NULL, NULL
},
{
{"gp_gang_creation_retry_count", PGC_USERSET, GP_ARRAY_TUNING,
gettext_noop("After a gang-creation fails, retry the number of times if failure is retryable."),
......
......@@ -346,7 +346,7 @@ extern int gp_snapshotadd_timeout; /* GUC var - timeout specifier for snapshot-c
extern int gp_fts_probe_retries; /* GUC var - specifies probe number of retries for FTS */
extern int gp_fts_probe_timeout; /* GUC var - specifies probe timeout for FTS */
extern int gp_fts_probe_interval; /* GUC var - specifies polling interval for FTS */
extern int gp_fts_probe_threadcount; /* GUC var - specifies number of threads to use for FTS probes */
extern int gp_fts_mark_mirror_down_grace_period; /* GUC var - grace period (in seconds) before FTS marks a disconnected mirror as down */
extern int gp_gang_creation_retry_count; /* How many retries ? */
extern int gp_gang_creation_retry_timer; /* How long between retries */
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册