From 3c9bb8751a440c9f4bde69ceb54b624f52f589f6 Mon Sep 17 00:00:00 2001
From: antirez <antirez@gmail.com>
Date: Fri, 20 Sep 2013 11:26:44 +0200
Subject: [PATCH] Cluster: PFAIL -> FAIL transition allowed for slaves.

First change: a node no longer needs to be a master in order to detect a
failure; however, a majority of masters signaling PFAIL or FAIL is still needed.

This change is important because it allows slaves rejoining the cluster
after a partition to sense the FAIL condition so that eventually all the
nodes agree on failures.
---
 src/cluster.c | 33 +++++++++++++++++++++++----------
 1 file changed, 23 insertions(+), 10 deletions(-)

diff --git a/src/cluster.c b/src/cluster.c
index f85b7569..e562c00c 100644
--- a/src/cluster.c
+++ b/src/cluster.c
@@ -594,25 +594,36 @@ void clusterRenameNode(clusterNode *node, char *newname) {
 /* This function checks if a given node should be marked as FAIL.
  * It happens if the following conditions are met:
  *
- * 1) We are a master node. Only master nodes can mark a node as failing.
- * 2) We received enough failure reports from other nodes via gossip.
- *    Enough means that the majority of the masters believe the node is
- *    down.
- * 3) We believe this node is in PFAIL state.
+ * 1) We received enough failure reports from other master nodes via gossip.
+ *    Enough means that the majority of the masters signaled the node is
+ *    down recently.
+ * 2) We believe this node is in PFAIL state.
  *
  * If a failure is detected we also inform the whole cluster about this
  * event trying to force every other node to set the FAIL flag for the node.
+ *
+ * Note that the form of agreement used here is weak, as we collect the state
+ * of the majority of masters over a period of time, and even if we force
+ * agreement by propagating the FAIL message, because of partitions we may not
+ * reach every node. However:
+ *
+ * 1) Either we reach the majority and eventually the FAIL state will propagate
+ *    to all the cluster.
+ * 2) Or there is no majority so no slave promotion will be authorized and the
+ *    FAIL flag will be cleared after some time.
  */
 void markNodeAsFailingIfNeeded(clusterNode *node) {
     int failures;
     int needed_quorum = (server.cluster->size / 2) + 1;
 
-    if (!(server.cluster->myself->flags & REDIS_NODE_MASTER)) return;
     if (!(node->flags & REDIS_NODE_PFAIL)) return; /* We can reach it. */
     if (node->flags & REDIS_NODE_FAIL) return; /* Already FAILing. */
 
-    failures = 1 + clusterNodeFailureReportsCount(node); /* +1 is for myself. */
-    if (failures < needed_quorum) return;
+    failures = clusterNodeFailureReportsCount(node);
+    /* Also count myself as a voter if I'm a master. */
+    if (server.cluster->myself->flags & REDIS_NODE_MASTER)
+        failures += 1;
+    if (failures < needed_quorum) return; /* No weak agreement from masters. */
 
     redisLog(REDIS_NOTICE,
         "Marking node %.40s as failing (quorum reached).", node->name);
@@ -622,8 +633,10 @@ void markNodeAsFailingIfNeeded(clusterNode *node) {
     node->flags |= REDIS_NODE_FAIL;
     node->fail_time = time(NULL);
 
-    /* Broadcast the failing node name to everybody */
-    clusterSendFail(node->name);
+    /* If we are a master, broadcast the failing node name to everybody,
+     * forcing all the other reachable nodes to flag the node as FAIL. */
+    if (server.cluster->myself->flags & REDIS_NODE_MASTER)
+        clusterSendFail(node->name);
     clusterUpdateState();
     clusterSaveConfigOrDie();
 }
-- 
GitLab