提交 3119f4f6 编写于 作者: A antirez

Cluster: better timeout and retry time for failover.

When node-timeout is too small, in the order of a few milliseconds,
there is no way the voting process can terminate during that time, so we
set a lower limit for the failover timeout of two seconds.

The retry time is set to two times the failover timeout time, so it is
at least 4 seconds.
上级 14f77b34
......@@ -2063,6 +2063,18 @@ void clusterHandleSlaveFailover(void) {
int manual_failover = server.cluster->mf_end != 0 &&
server.cluster->mf_can_start;
int j;
mstime_t auth_timeout, auth_retry_time;
/* Compute the failover timeout (the max time we have to send votes
* and wait for replies), and the failover retry time (the time to wait
* before waiting again.
*
* Timeout is MIN(NODE_TIMEOUT*2,2000) milliseconds.
* Retry is two times the Timeout.
*/
auth_timeout = server.cluster_node_timeout*2;
if (auth_timeout < 2000) auth_timeout = 2000;
auth_retry_time = auth_timeout*2;
/* Pre conditions to run the function:
* 1) We are a slave.
......@@ -2073,8 +2085,6 @@ void clusterHandleSlaveFailover(void) {
(!nodeFailed(myself->slaveof) && !manual_failover) ||
myself->slaveof->numslots == 0) return;
/* If this is a manual failover, are we ready to start? */
/* Set data_age to the number of seconds we are disconnected from
* the master. */
if (server.repl_state == REDIS_REPL_CONNECTED) {
......@@ -2097,10 +2107,9 @@ void clusterHandleSlaveFailover(void) {
(server.cluster_node_timeout * REDIS_CLUSTER_SLAVE_VALIDITY_MULT))
return;
/* Compute the time at which we can start an election. */
if (auth_age >
server.cluster_node_timeout * REDIS_CLUSTER_FAILOVER_AUTH_RETRY_MULT)
{
/* If the previous failover attempt timedout and the retry time has
* elapsed, we can setup a new one. */
if (auth_age > auth_retry_time) {
server.cluster->failover_auth_time = mstime() +
500 + /* Fixed delay of 500 milliseconds, let FAIL msg propagate. */
random() % 500; /* Random delay between 0 and 500 milliseconds. */
......@@ -2152,7 +2161,7 @@ void clusterHandleSlaveFailover(void) {
if (mstime() < server.cluster->failover_auth_time) return;
/* Return ASAP if the election is too old to be valid. */
if (auth_age > server.cluster_node_timeout) return;
if (auth_age > auth_timeout) return;
/* Ask for votes if needed. */
if (server.cluster->failover_auth_sent == 0) {
......
......@@ -18,7 +18,6 @@
#define REDIS_CLUSTER_FAIL_UNDO_TIME_MULT 2 /* Undo fail if master is back. */
#define REDIS_CLUSTER_FAIL_UNDO_TIME_ADD 10 /* Some additional time. */
#define REDIS_CLUSTER_SLAVE_VALIDITY_MULT 10 /* Slave data validity. */
#define REDIS_CLUSTER_FAILOVER_AUTH_RETRY_MULT 4 /* Auth request retry time. */
#define REDIS_CLUSTER_FAILOVER_DELAY 5 /* Seconds */
#define REDIS_CLUSTER_DEFAULT_MIGRATION_BARRIER 1
#define REDIS_CLUSTER_MF_TIMEOUT 5000 /* Milliseconds to do a manual failover. */
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册