提交 fb9b76fe 编写于 作者: A antirez

Cluster: slave node now uses the new protocol to get elected.

上级 656c3ffe
......@@ -257,6 +257,7 @@ void clusterInit(void) {
server.cluster->nodes = dictCreate(&clusterNodesDictType,NULL);
server.cluster->failover_auth_time = 0;
server.cluster->failover_auth_count = 0;
server.cluster->failover_auth_epoch = 0;
memset(server.cluster->migrating_slots_to,0,
sizeof(server.cluster->migrating_slots_to));
memset(server.cluster->importing_slots_from,0,
......@@ -1581,16 +1582,22 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) {
*
* The goal of this function is:
* 1) To check if we are able to perform a failover, is our data updated?
* 2) Ask reachable masters the authorization to perform the failover.
* 2) Try to get elected by masters.
* 3) Check if there is the majority of masters agreeing we should failover.
* 4) Perform the failover informing all the other nodes.
*/
void clusterHandleSlaveFailover(void) {
time_t data_age = server.unixtime - server.repl_down_since;
time_t auth_age = server.unixtime - server.cluster->failover_auth_time;
mstime_t auth_age = mstime() - server.cluster->failover_auth_time;
int needed_quorum = (server.cluster->size / 2) + 1;
int j;
/* Remove the node timeout from the data age as it is fine that we are
* disconnected from our master at least for the time it was down to be
* flagged as FAIL, that's the baseline. */
if (data_age > server.cluster_node_timeout)
data_age -= server.cluster_node_timeout;
/* Check if our data is recent enough. For now we just use a fixed
* constant of ten times the node timeout since the cluster should
* react much faster to a master down. */
......@@ -1598,19 +1605,37 @@ void clusterHandleSlaveFailover(void) {
server.cluster_node_timeout * REDIS_CLUSTER_SLAVE_VALIDITY_MULT)
return;
/* TODO: check if we are the first slave as well? Or just rely on the
* master authorization? */
/* Ask masters if we are authorized to perform the failover. If there
* is a pending auth request that's too old, reset it. */
/* Compute the time at which we can start an election. */
if (server.cluster->failover_auth_time == 0 ||
auth_age >
server.cluster_node_timeout * REDIS_CLUSTER_FAILOVER_AUTH_RETRY_MULT)
server.cluster_node_timeout * 1000 * REDIS_CLUSTER_FAILOVER_AUTH_RETRY_MULT)
{
redisLog(REDIS_WARNING,"Asking masters if I can failover...");
server.cluster->failover_auth_time = time(NULL);
server.cluster->failover_auth_time = mstime() +
500 + /* Fixed delay of 500 milliseconds, let FAIL msg propagate. */
data_age * 100 + /* Add 100 milliseconds for every second of age. */
random() % 500; /* Random delay between 0 and 500 milliseconds. */
server.cluster->failover_auth_count = 0;
server.cluster->failover_auth_sent = 0;
redisLog(REDIS_WARNING,"Start of election delayed for %lld milliseconds.",
server.cluster->failover_auth_time - mstime());
return;
}
/* Return ASAP if it is still too early to start the election. */
if (mstime() < server.cluster->failover_auth_time) return;
/* Return ASAP if the election is too old to be valid. */
if (mstime() - server.cluster->failover_auth_time > server.cluster_node_timeout)
return;
/* Ask for votes if needed. */
if (server.cluster->failover_auth_sent == 0) {
server.cluster->currentEpoch++;
server.cluster->failover_auth_epoch = server.cluster->currentEpoch;
redisLog(REDIS_WARNING,"Starting a failover election for epoch %llu.",
server.cluster->currentEpoch);
clusterRequestFailoverAuth();
server.cluster->failover_auth_sent = 1;
return; /* Wait for replies. */
}
......@@ -1619,7 +1644,7 @@ void clusterHandleSlaveFailover(void) {
clusterNode *oldmaster = server.cluster->myself->slaveof;
redisLog(REDIS_WARNING,
"Masters quorum reached: failing over my (failing) master.");
"Failover election won: failing over my (failing) master.");
/* We have the quorum, perform all the steps to correctly promote
* this slave to a master.
*
......@@ -1644,7 +1669,10 @@ void clusterHandleSlaveFailover(void) {
* accordingly and detect that we switched to master role. */
clusterBroadcastPong();
/* 4) Update state and save config. */
/* 4) Update my configEpoch to the epoch of the election. */
server.cluster->myself->configEpoch = server.cluster->failover_auth_epoch;
/* 5) Update state and save config. */
clusterUpdateState();
clusterSaveConfigOrDie();
}
......
......@@ -368,6 +368,8 @@
* Data types
*----------------------------------------------------------------------------*/
typedef long long mstime_t; /* millisecond time type. */
/* A redis object, that is a type able to hold a string / list / set */
/* The actual Redis Object */
......@@ -581,7 +583,7 @@ typedef struct redisOpArray {
#define REDIS_CLUSTER_FAIL_UNDO_TIME_MULT 2 /* Undo fail if master is back. */
#define REDIS_CLUSTER_FAIL_UNDO_TIME_ADD 10 /* Some additional time. */
#define REDIS_CLUSTER_SLAVE_VALIDITY_MULT 10 /* Slave data validity. */
#define REDIS_CLUSTER_FAILOVER_AUTH_RETRY_MULT 1 /* Auth request retry time. */
#define REDIS_CLUSTER_FAILOVER_AUTH_RETRY_MULT 4 /* Auth request retry time. */
#define REDIS_CLUSTER_FAILOVER_DELAY 5 /* Seconds */
struct clusterNode;
......@@ -643,8 +645,11 @@ typedef struct {
clusterNode *importing_slots_from[REDIS_CLUSTER_SLOTS];
clusterNode *slots[REDIS_CLUSTER_SLOTS];
zskiplist *slots_to_keys;
int failover_auth_time; /* Time at which we sent the AUTH request. */
int failover_auth_count; /* Number of authorizations received. */
/* The following fields are used to track the slave state during elections. */
mstime_t failover_auth_time;/* Time at which we'll try to get elected in ms. */
int failover_auth_count; /* Number of votes received so far. */
int failover_auth_sent; /* True if we already asked for votes. */
uint64_t failover_auth_epoch; /* Epoch of the current election. */
} clusterState;
/* Redis cluster messages header */
......
......@@ -43,8 +43,6 @@ extern char **environ;
/* ======================== Sentinel global state =========================== */
typedef long long mstime_t; /* millisecond time type. */
/* Address object, used to describe an ip:port pair. */
typedef struct sentinelAddr {
char *ip;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册