From 62893f5b9ff0465bc6073c5e448d88f8ae352dab Mon Sep 17 00:00:00 2001
From: antirez <antirez@gmail.com>
Date: Fri, 20 Mar 2015 09:57:10 +0100
Subject: [PATCH] Cluster: better cluster state transiction handling.

Before we relied on the global cluster state to make sure all the hash
slots are linked to some node, when getNodeByQuery() is called. So
finding the hash slot unbound was checked with an assertion. However
this is fragile. The cluster state is often updated in the
clusterBeforeSleep() function, and not ASAP on state change, so it may
happen to process clients with a cluster state that is 'ok' but yet
certain hash slots set to NULL.

With this commit the condition is also checked in getNodeByQuery() and
reported with a identical error code of -CLUSTERDOWN but slightly
different error message so that we have more debugging clue in the
future.

Root cause of issue #2288.
---
 src/cluster.c | 20 ++++++++++++++++++--
 src/cluster.h |  1 +
 src/redis.c   |  2 ++
 3 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/src/cluster.c b/src/cluster.c
index 80cd7701..178ce9b7 100644
--- a/src/cluster.c
+++ b/src/cluster.c
@@ -4690,7 +4690,12 @@ void readwriteCommand(redisClient *c) {
  *
  * REDIS_CLUSTER_REDIR_UNSTABLE if the request contains mutliple keys
  * belonging to the same slot, but the slot is not stable (in migration or
- * importing state, likely because a resharding is in progress). */
+ * importing state, likely because a resharding is in progress).
+ *
+ * REDIS_CLUSTER_REDIR_DOWN if the request addresses a slot which is not
+ * bound to any node. In this case the cluster global state should be already
+ * "down" but it is fragile to rely on the update of the global state, so
+ * we also handle it here. */
 clusterNode *getNodeByQuery(redisClient *c, struct redisCommand *cmd, robj **argv, int argc, int *hashslot, int *error_code) {
     clusterNode *n = NULL;
     robj *firstkey = NULL;
@@ -4744,7 +4749,18 @@ clusterNode *getNodeByQuery(redisClient *c, struct redisCommand *cmd, robj **arg
                 firstkey = thiskey;
                 slot = thisslot;
                 n = server.cluster->slots[slot];
-                redisAssertWithInfo(c,firstkey,n != NULL);
+
+                /* Error: If a slot is not served, we are in "cluster down"
+                 * state. However the state is yet to be updated, so this was
+                 * not trapped earlier in processCommand(). Report the same
+                 * error to the client. */
+                if (n == NULL) {
+                    getKeysFreeResult(keyindex);
+                    if (error_code)
+                        *error_code = REDIS_CLUSTER_REDIR_DOWN;
+                    return NULL;
+                }
+
                 /* If we are migrating or importing this slot, we need to check
                  * if we have all the keys in the request (the only way we
                  * can safely serve the request, otherwise we return a TRYAGAIN
diff --git a/src/cluster.h b/src/cluster.h
index ef5caf0d..8eaa0ab9 100644
--- a/src/cluster.h
+++ b/src/cluster.h
@@ -30,6 +30,7 @@
 #define REDIS_CLUSTER_REDIR_UNSTABLE 2      /* Keys in slot resharding. */
 #define REDIS_CLUSTER_REDIR_ASK 3           /* -ASK redirection required. */
 #define REDIS_CLUSTER_REDIR_MOVED 4         /* -MOVED redirection required. */
+#define REDIS_CLUSTER_REDIR_DOWN 5          /* -CLUSTERDOWN error. */
 
 struct clusterNode;
 
diff --git a/src/redis.c b/src/redis.c
index 3ffc62d0..918cae1e 100644
--- a/src/redis.c
+++ b/src/redis.c
@@ -2178,6 +2178,8 @@ int processCommand(redisClient *c) {
                      * but the slot is not "stable" currently as there is
                      * a migration or import in progress. */
                     addReplySds(c,sdsnew("-TRYAGAIN Multiple keys request during rehashing of slot\r\n"));
+                } else if (error_code == REDIS_CLUSTER_REDIR_DOWN) {
+                    addReplySds(c,sdsnew("-CLUSTERDOWN The cluster is down. Hash slot is unbound\r\n"));
                 } else {
                     redisPanic("getNodeByQuery() unknown error.");
                 }
-- 
GitLab