From 62893f5b9ff0465bc6073c5e448d88f8ae352dab Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 20 Mar 2015 09:57:10 +0100 Subject: [PATCH] Cluster: better cluster state transiction handling. Before we relied on the global cluster state to make sure all the hash slots are linked to some node, when getNodeByQuery() is called. So finding the hash slot unbound was checked with an assertion. However this is fragile. The cluster state is often updated in the clusterBeforeSleep() function, and not ASAP on state change, so it may happen to process clients with a cluster state that is 'ok' but yet certain hash slots set to NULL. With this commit the condition is also checked in getNodeByQuery() and reported with a identical error code of -CLUSTERDOWN but slightly different error message so that we have more debugging clue in the future. Root cause of issue #2288. --- src/cluster.c | 20 ++++++++++++++++++-- src/cluster.h | 1 + src/redis.c | 2 ++ 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index 80cd7701..178ce9b7 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -4690,7 +4690,12 @@ void readwriteCommand(redisClient *c) { * * REDIS_CLUSTER_REDIR_UNSTABLE if the request contains mutliple keys * belonging to the same slot, but the slot is not stable (in migration or - * importing state, likely because a resharding is in progress). */ + * importing state, likely because a resharding is in progress). + * + * REDIS_CLUSTER_REDIR_DOWN if the request addresses a slot which is not + * bound to any node. In this case the cluster global state should be already + * "down" but it is fragile to rely on the update of the global state, so + * we also handle it here. */ clusterNode *getNodeByQuery(redisClient *c, struct redisCommand *cmd, robj **argv, int argc, int *hashslot, int *error_code) { clusterNode *n = NULL; robj *firstkey = NULL; @@ -4744,7 +4749,18 @@ clusterNode *getNodeByQuery(redisClient *c, struct redisCommand *cmd, robj **arg firstkey = thiskey; slot = thisslot; n = server.cluster->slots[slot]; - redisAssertWithInfo(c,firstkey,n != NULL); + + /* Error: If a slot is not served, we are in "cluster down" + * state. However the state is yet to be updated, so this was + * not trapped earlier in processCommand(). Report the same + * error to the client. */ + if (n == NULL) { + getKeysFreeResult(keyindex); + if (error_code) + *error_code = REDIS_CLUSTER_REDIR_DOWN; + return NULL; + } + /* If we are migrating or importing this slot, we need to check * if we have all the keys in the request (the only way we * can safely serve the request, otherwise we return a TRYAGAIN diff --git a/src/cluster.h b/src/cluster.h index ef5caf0d..8eaa0ab9 100644 --- a/src/cluster.h +++ b/src/cluster.h @@ -30,6 +30,7 @@ #define REDIS_CLUSTER_REDIR_UNSTABLE 2 /* Keys in slot resharding. */ #define REDIS_CLUSTER_REDIR_ASK 3 /* -ASK redirection required. */ #define REDIS_CLUSTER_REDIR_MOVED 4 /* -MOVED redirection required. */ +#define REDIS_CLUSTER_REDIR_DOWN 5 /* -CLUSTERDOWN error. */ struct clusterNode; diff --git a/src/redis.c b/src/redis.c index 3ffc62d0..918cae1e 100644 --- a/src/redis.c +++ b/src/redis.c @@ -2178,6 +2178,8 @@ int processCommand(redisClient *c) { * but the slot is not "stable" currently as there is * a migration or import in progress. */ addReplySds(c,sdsnew("-TRYAGAIN Multiple keys request during rehashing of slot\r\n")); + } else if (error_code == REDIS_CLUSTER_REDIR_DOWN) { + addReplySds(c,sdsnew("-CLUSTERDOWN The cluster is down. Hash slot is unbound\r\n")); } else { redisPanic("getNodeByQuery() unknown error."); } -- GitLab