From a445aa30a0fb72bc54da083998b3458fad92820f Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 26 Sep 2013 13:00:41 +0200 Subject: [PATCH] Cluster: master node now uses new protocol to vote. --- src/cluster.c | 69 +++++++++++++++++++++++++-------------------------- src/redis.h | 3 +++ 2 files changed, 37 insertions(+), 35 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index 9ce2905e..24d3efe3 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -258,6 +258,7 @@ void clusterInit(void) { server.cluster->failover_auth_time = 0; server.cluster->failover_auth_count = 0; server.cluster->failover_auth_epoch = 0; + server.cluster->last_vote_epoch = 0; memset(server.cluster->migrating_slots_to,0, sizeof(server.cluster->migrating_slots_to)); memset(server.cluster->importing_slots_from,0, @@ -396,6 +397,7 @@ clusterNode *createClusterNode(char *nodename, int flags) { memset(node->ip,0,sizeof(node->ip)); node->port = 0; node->fail_reports = listCreate(); + node->voted_time = 0; listSetFreeMethod(node->fail_reports,zfree); return node; } @@ -1178,15 +1180,18 @@ int clusterProcessPacket(clusterLink *link) { } } else if (type == CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST) { if (!sender) return 1; /* We don't know that node. */ - /* If we are not a master, ignore that message at all. */ - if (!(server.cluster->myself->flags & REDIS_NODE_MASTER)) return 0; clusterSendFailoverAuthIfNeeded(sender,hdr); } else if (type == CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK) { if (!sender) return 1; /* We don't know that node. */ - /* If this is a master, increment the number of acknowledges - * we received so far. */ - if (sender->flags & REDIS_NODE_MASTER) + /* We consider this vote only if the sender is a master serving + * a non-zero number of slots, with the currentEpoch that is equal + * to our currentEpoch. 
*/ + if (sender->flags & REDIS_NODE_MASTER && + sender->numslots > 0 && + senderCurrentEpoch == server.cluster->currentEpoch) + { server.cluster->failover_auth_count++; + } } else { redisLog(REDIS_WARNING,"Received unknown packet type: %d", type); } @@ -1538,43 +1543,38 @@ void clusterSendFailoverAuth(clusterNode *node, uint64_t reqtime) { clusterSendMessage(node->link,buf,totlen); } -/* If we believe 'node' is the "first slave" of it's master, reply with - * a FAILOVER_AUTH_GRANTED packet. - * The 'request' field points to the authorization request packet header, we - * need it in order to copy back the 'time' field in our reply. - * - * To be a first slave the sender must: - * 1) Be a slave. - * 2) Its master should be in FAIL state. - * 3) Ordering all the slaves IDs for its master by run-id, it should be the - * first (the smallest) among the ones not in FAIL / PFAIL state. - */ +/* Vote for the node asking for our vote if there are the conditions. */ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) { - char first[REDIS_CLUSTER_NAMELEN]; clusterNode *master = node->slaveof; - int j; + uint64_t requestEpoch = ntohu64(request->currentEpoch); + + /* If we are not a master serving at least 1 slot, we don't have the + * right to vote, as the cluster size in Redis Cluster is the number + * of masters serving at least one slot, and quorum is the cluster size / 2 + 1 */ + if (!(server.cluster->myself->flags & REDIS_NODE_MASTER)) return; + if (server.cluster->myself->numslots == 0) return; + + /* Request epoch must be >= our currentEpoch. */ + if (requestEpoch < server.cluster->currentEpoch) return; - /* Node is a slave? Its master is down? */ + /* I already voted for this epoch? Return ASAP. */ + if (server.cluster->last_vote_epoch == server.cluster->currentEpoch) return; + + /* Node must be a slave and its master down. 
*/ if (!(node->flags & REDIS_NODE_SLAVE) || master == NULL || !(master->flags & REDIS_NODE_FAIL)) return; - /* Iterate all the master slaves to check what's the first one. */ - memset(first,0xff,sizeof(first)); - for (j = 0; j < master->numslaves; j++) { - clusterNode *slave = master->slaves[j]; - - if (slave->flags & (REDIS_NODE_FAIL|REDIS_NODE_PFAIL)) continue; - if (memcmp(slave->name,first,sizeof(first)) < 0) { - memcpy(first,slave->name,sizeof(first)); - } - } - - /* Is 'node' the first slave? */ - if (memcmp(node->name,first,sizeof(first)) != 0) return; + /* We must not have voted for a slave of this master within the last + * two times the node timeout. This is not strictly needed for correctness + * of the algorithm but makes the base case more linear. */ + if (server.unixtime - node->slaveof->voted_time < + server.cluster_node_timeout * 2) return; - /* We can send the packet. */ + /* We can vote for this slave. */ clusterSendFailoverAuth(node,request->time); + server.cluster->last_vote_epoch = server.cluster->currentEpoch; + node->slaveof->voted_time = server.unixtime; } /* This function is called if we are a slave node and our master serving @@ -1583,8 +1583,7 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) { * The gaol of this function is: * 1) To check if we are able to perform a failover, is our data updated? * 2) Try to get elected by masters. - * 3) Check if there is the majority of masters agreeing we should failover. - * 4) Perform the failover informing all the other nodes. + * 3) Perform the failover informing all the other nodes. 
*/ void clusterHandleSlaveFailover(void) { time_t data_age = server.unixtime - server.repl_down_since; diff --git a/src/redis.h b/src/redis.h index 2b7ca7a0..66c751a1 100644 --- a/src/redis.h +++ b/src/redis.h @@ -628,6 +628,7 @@ struct clusterNode { time_t ping_sent; /* Unix time we sent latest ping */ time_t pong_received; /* Unix time we received the pong */ time_t fail_time; /* Unix time when FAIL flag was set */ + time_t voted_time; /* Last time we voted for a slave of this master */ char ip[REDIS_IP_STR_LEN]; /* Latest known IP address of this node */ int port; /* Latest known port of this node */ clusterLink *link; /* TCP/IP link with this node */ @@ -650,6 +651,8 @@ typedef struct { int failover_auth_count; /* Number of votes received so far. */ int failover_auth_sent; /* True if we already asked for votes. */ uint64_t failover_auth_epoch; /* Epoch of the current election. */ + /* The following fields are used by masters to keep state on elections. */ + uint64_t last_vote_epoch; /* Epoch of the last vote granted. */ } clusterState; /* Redis cluster messages header */ -- GitLab