cluster.h 14.0 KB
Newer Older
A
antirez 已提交
1 2
#ifndef __CLUSTER_H
#define __CLUSTER_H
3 4 5 6 7

/*-----------------------------------------------------------------------------
 * Redis cluster data structures, defines, exported API.
 *----------------------------------------------------------------------------*/

A
antirez 已提交
8 9 10 11 12
/* Number of slots; sizes the per-slot tables and slot bitmaps in this file. */
#define CLUSTER_SLOTS     16384
/* Values for clusterState.state. */
#define CLUSTER_OK        0     /* Everything looks ok. */
#define CLUSTER_FAIL      1     /* The cluster can't work. */
/* Length of a node name: a sha1 digest written as a hex string. */
#define CLUSTER_NAMELEN   40
/* Cluster port = baseport + CLUSTER_PORT_INCR. */
#define CLUSTER_PORT_INCR 10000
13

14
/* The following defines are amounts of time, sometimes expressed as
 * multipliers of the node timeout value (when ending with MULT). */
A
antirez 已提交
16 17 18 19 20 21
#define CLUSTER_FAIL_REPORT_VALIDITY_MULT 2 /* Fail report validity. */
#define CLUSTER_FAIL_UNDO_TIME_MULT 2 /* Undo fail if master is back. */
#define CLUSTER_FAIL_UNDO_TIME_ADD 10 /* Some additional time. */
#define CLUSTER_FAILOVER_DELAY 5 /* Seconds */
#define CLUSTER_MF_TIMEOUT 5000 /* Milliseconds to do a manual failover. */
#define CLUSTER_MF_PAUSE_MULT 2 /* Master pause manual failover mult. */
#define CLUSTER_SLAVE_MIGRATION_DELAY 5000 /* Delay for slave migration. */
23

24
/* Redirection errors returned by getNodeByQuery(). */
A
antirez 已提交
25 26 27 28 29 30 31
#define CLUSTER_REDIR_NONE 0          /* Node can serve the request. */
#define CLUSTER_REDIR_CROSS_SLOT 1    /* -CROSSSLOT request. */
#define CLUSTER_REDIR_UNSTABLE 2      /* -TRYAGAIN redirection required */
#define CLUSTER_REDIR_ASK 3           /* -ASK redirection required. */
#define CLUSTER_REDIR_MOVED 4         /* -MOVED redirection required. */
#define CLUSTER_REDIR_DOWN_STATE 5    /* -CLUSTERDOWN, global state. */
#define CLUSTER_REDIR_DOWN_UNBOUND 6  /* -CLUSTERDOWN, unbound slot. */
#define CLUSTER_REDIR_DOWN_RO_STATE 7 /* -CLUSTERDOWN, allow reads. */
33

34 35 36 37
struct clusterNode;

/* clusterLink encapsulates everything needed to talk with a remote node. */
typedef struct clusterLink {
38
    mstime_t ctime;             /* Link creation time */
39
    connection *conn;           /* Connection to remote node */
40 41 42 43 44
    sds sndbuf;                 /* Packet send buffer */
    sds rcvbuf;                 /* Packet reception buffer */
    struct clusterNode *node;   /* Node related to this link if any, or NULL */
} clusterLink;

45
/* Cluster node flags and macros. */
A
antirez 已提交
46 47 48 49 50 51 52 53
/* Bit flags stored in clusterNode.flags; each value is a distinct power of
 * two so that flags can be OR-ed together. */
#define CLUSTER_NODE_MASTER 1     /* The node is a master */
#define CLUSTER_NODE_SLAVE 2      /* The node is a slave */
#define CLUSTER_NODE_PFAIL 4      /* Failure? Need acknowledge */
#define CLUSTER_NODE_FAIL 8       /* The node is believed to be malfunctioning */
#define CLUSTER_NODE_MYSELF 16    /* This node is myself */
#define CLUSTER_NODE_HANDSHAKE 32 /* We have still to exchange the first ping */
#define CLUSTER_NODE_NOADDR   64  /* We don't know the address of this node */
#define CLUSTER_NODE_MEET 128     /* Send a MEET message to this node */
#define CLUSTER_NODE_MIGRATE_TO 256 /* Master eligible for replica migration. */
#define CLUSTER_NODE_NOFAILOVER 512 /* Slave will not try to failover. */

/* All-zero node name: 40 NUL bytes (plus the implicit string terminator),
 * written as four groups of ten so the length is easy to check. */
#define CLUSTER_NODE_NULL_NAME \
    "\000\000\000\000\000\000\000\000\000\000" \
    "\000\000\000\000\000\000\000\000\000\000" \
    "\000\000\000\000\000\000\000\000\000\000" \
    "\000\000\000\000\000\000\000\000\000\000"
57

A
antirez 已提交
58 59 60 61 62 63 64
/* Flag predicates on a pointer to a structure with a `flags` member.
 * Except for nodeHasAddr(), which yields 0 or 1, each macro yields the raw
 * masked bit (zero or the flag value), so use the result only as a truth
 * value. */
#define nodeIsMaster(n) ((n)->flags & CLUSTER_NODE_MASTER)
#define nodeIsSlave(n) ((n)->flags & CLUSTER_NODE_SLAVE)
#define nodeInHandshake(n) ((n)->flags & CLUSTER_NODE_HANDSHAKE)
#define nodeHasAddr(n) (!((n)->flags & CLUSTER_NODE_NOADDR))
#define nodeWithoutAddr(n) ((n)->flags & CLUSTER_NODE_NOADDR)
#define nodeTimedOut(n) ((n)->flags & CLUSTER_NODE_PFAIL)
#define nodeFailed(n) ((n)->flags & CLUSTER_NODE_FAIL)
#define nodeCantFailover(n) ((n)->flags & CLUSTER_NODE_NOFAILOVER)
66

67
/* Reasons why a slave is not able to failover. */
A
antirez 已提交
68 69 70 71 72 73
/* Reason codes; clusterState.cant_failover_reason refers to these
 * CANT_FAILOVER_* macros. */
#define CLUSTER_CANT_FAILOVER_NONE          0
#define CLUSTER_CANT_FAILOVER_DATA_AGE      1
#define CLUSTER_CANT_FAILOVER_WAITING_DELAY 2
#define CLUSTER_CANT_FAILOVER_EXPIRED       3
#define CLUSTER_CANT_FAILOVER_WAITING_VOTES 4
/* Not a reason code: a period expressed in seconds (five minutes). */
#define CLUSTER_CANT_FAILOVER_RELOG_PERIOD  (60*5)
74

75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
/* Bit flags OR-ed into clusterState.todo_before_sleep, the set of things
 * to do in clusterBeforeSleep(). */
#define CLUSTER_TODO_HANDLE_FAILOVER (1 << 0)
#define CLUSTER_TODO_UPDATE_STATE    (1 << 1)
#define CLUSTER_TODO_SAVE_CONFIG     (1 << 2)
#define CLUSTER_TODO_FSYNC_CONFIG    (1 << 3)

/* Message types.
 *
 * Note that the PING, PONG and MEET messages are actually the same exact
 * kind of packet. PONG is the reply to ping, in the exact format as a PING,
 * while MEET is a special PING that forces the receiver to add the sender
 * as a node (if it is not already in the list).
 *
 * CLUSTERMSG_TYPE_COUNT must stay one past the highest type value: it sizes
 * the per-type message counters in clusterState. */
#define CLUSTERMSG_TYPE_PING 0          /* Ping */
#define CLUSTERMSG_TYPE_PONG 1          /* Pong (reply to Ping) */
#define CLUSTERMSG_TYPE_MEET 2          /* Meet "let's join" message */
#define CLUSTERMSG_TYPE_FAIL 3          /* Mark node xxx as failing */
#define CLUSTERMSG_TYPE_PUBLISH 4       /* Pub/Sub Publish propagation */
#define CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST 5 /* May I failover? */
#define CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK 6     /* Yes, you have my vote */
#define CLUSTERMSG_TYPE_UPDATE 7        /* Another node slots configuration */
#define CLUSTERMSG_TYPE_MFSTART 8       /* Pause clients for manual failover */
#define CLUSTERMSG_TYPE_MODULE 9        /* Module cluster API message. */
#define CLUSTERMSG_TYPE_COUNT 10        /* Total number of message types. */
98

99 100 101
/* Flags that a module can set in order to prevent certain Redis Cluster
 * features to be enabled. Useful when implementing a different distributed
 * system on top of Redis Cluster message bus, using modules.
 *
 * Note that the flags start at bit 1; bit 0 is not assigned here. */
#define CLUSTER_MODULE_FLAG_NONE 0
#define CLUSTER_MODULE_FLAG_NO_FAILOVER (1<<1)
#define CLUSTER_MODULE_FLAG_NO_REDIRECTION (1<<2)
105

106
/* This structure represents the elements of node->fail_reports. */
A
antirez 已提交
107
typedef struct clusterNodeFailReport {
108
    struct clusterNode *node;  /* Node reporting the failure condition. */
109
    mstime_t time;             /* Time of the last report from this node. */
A
antirez 已提交
110
} clusterNodeFailReport;
111

A
antirez 已提交
112
typedef struct clusterNode {
113
    mstime_t ctime; /* Node object creation time. */
A
antirez 已提交
114 115
    char name[CLUSTER_NAMELEN]; /* Node name, hex string, sha1-size */
    int flags;      /* CLUSTER_NODE_... */
116
    uint64_t configEpoch; /* Last configEpoch observed for this node */
A
antirez 已提交
117
    unsigned char slots[CLUSTER_SLOTS/8]; /* slots handled by this node */
118 119 120
    int numslots;   /* Number of slots handled by this node */
    int numslaves;  /* Number of slave nodes, if this is a master */
    struct clusterNode **slaves; /* pointers to slave nodes */
121 122 123 124
    struct clusterNode *slaveof; /* pointer to the master node. Note that it
                                    may be NULL even if the node is a slave
                                    if we don't have the master node in our
                                    tables. */
125 126
    mstime_t ping_sent;      /* Unix time we sent latest ping */
    mstime_t pong_received;  /* Unix time we received the pong */
127
    mstime_t data_received;  /* Unix time we received any data */
128 129 130
    mstime_t fail_time;      /* Unix time when FAIL flag was set */
    mstime_t voted_time;     /* Last time we voted for a slave of this master */
    mstime_t repl_offset_time;  /* Unix time we received offset for this node */
A
antirez 已提交
131
    mstime_t orphaned_time;     /* Starting time of orphaned master condition */
132
    long long repl_offset;      /* Last known repl offset for this node. */
A
antirez 已提交
133
    char ip[NET_IP_STR_LEN];  /* Latest known IP address of this node */
134 135
    int port;                   /* Latest known clients port of this node */
    int cport;                  /* Latest known cluster port of this node. */
136 137
    clusterLink *link;          /* TCP/IP link with this node */
    list *fail_reports;         /* List of nodes signaling this as failing */
A
antirez 已提交
138
} clusterNode;
139 140 141 142

typedef struct clusterState {
    clusterNode *myself;  /* This node */
    uint64_t currentEpoch;
A
antirez 已提交
143
    int state;            /* CLUSTER_OK, CLUSTER_FAIL, ... */
144 145
    int size;             /* Num of master nodes with at least one slot */
    dict *nodes;          /* Hash table of name -> clusterNode structures */
146
    dict *nodes_black_list; /* Nodes we don't re-add for a few seconds. */
A
antirez 已提交
147 148 149
    clusterNode *migrating_slots_to[CLUSTER_SLOTS];
    clusterNode *importing_slots_from[CLUSTER_SLOTS];
    clusterNode *slots[CLUSTER_SLOTS];
150 151
    uint64_t slots_keys_count[CLUSTER_SLOTS];
    rax *slots_to_keys;
152
    /* The following fields are used to take the slave state on elections. */
153
    mstime_t failover_auth_time; /* Time of previous or next election. */
154 155
    int failover_auth_count;    /* Number of votes received so far. */
    int failover_auth_sent;     /* True if we already asked for votes. */
156
    int failover_auth_rank;     /* This slave rank for current auth request. */
157
    uint64_t failover_auth_epoch; /* Epoch of the current election. */
158 159
    int cant_failover_reason;   /* Why a slave is currently not able to
                                   failover. See the CANT_FAILOVER_* macros. */
160 161 162 163 164 165 166 167 168 169
    /* Manual failover state in common. */
    mstime_t mf_end;            /* Manual failover time limit (ms unixtime).
                                   It is zero if there is no MF in progress. */
    /* Manual failover state of master. */
    clusterNode *mf_slave;      /* Slave performing the manual failover. */
    /* Manual failover state of slave. */
    long long mf_master_offset; /* Master offset the slave needs to start MF
                                   or zero if stil not received. */
    int mf_can_start;           /* If non-zero signal that the manual failover
                                   can start requesting masters vote. */
170
    /* The followign fields are used by masters to take state on elections. */
171
    uint64_t lastVoteEpoch;     /* Epoch of the last vote granted. */
172
    int todo_before_sleep; /* Things to do in clusterBeforeSleep(). */
173 174 175
    /* Messages received and sent by type. */
    long long stats_bus_messages_sent[CLUSTERMSG_TYPE_COUNT];
    long long stats_bus_messages_received[CLUSTERMSG_TYPE_COUNT];
176 177
    long long stats_pfail_nodes;    /* Number of nodes in PFAIL status,
                                       excluding nodes without address. */
178 179 180 181 182 183 184 185
} clusterState;

/* Redis cluster messages header */

/* Initially we don't know our "name", but we'll find it once we connect
 * to the first node, using the getsockname() function. Then we'll use this
 * address for all the next messages. */
typedef struct {
A
antirez 已提交
186
    char nodename[CLUSTER_NAMELEN];
187 188
    uint32_t ping_sent;
    uint32_t pong_received;
A
antirez 已提交
189
    char ip[NET_IP_STR_LEN];  /* IP address last time it was seen */
190 191
    uint16_t port;              /* base port last time it was seen */
    uint16_t cport;             /* cluster port last time it was seen */
192
    uint16_t flags;             /* node->flags copy */
193
    uint32_t notused1;
194 195 196
} clusterMsgDataGossip;

typedef struct {
A
antirez 已提交
197
    char nodename[CLUSTER_NAMELEN];
198 199 200 201 202
} clusterMsgDataFail;

/* Payload of a PUBLISH message: two length fields followed by the data.
 * bulk_data is declared with 8 bytes only as a placeholder; the size of the
 * array is part of sizeof(clusterMsgDataPublish), so do not change it. */
typedef struct {
    uint32_t channel_len;
    uint32_t message_len;
    unsigned char bulk_data[8]; /* 8 bytes just as placeholder. */
} clusterMsgDataPublish;

206 207
typedef struct {
    uint64_t configEpoch; /* Config epoch of the specified instance. */
A
antirez 已提交
208 209
    char nodename[CLUSTER_NAMELEN]; /* Name of the slots owner. */
    unsigned char slots[CLUSTER_SLOTS/8]; /* Slots bitmap. */
210 211
} clusterMsgDataUpdate;

212 213 214 215 216 217 218
/* Payload of a MODULE message (see union clusterMsgData). */
typedef struct {
    uint64_t module_id;     /* ID of the sender module. */
    uint32_t len;           /* Length field. NOTE(review): the previous comment
                               here duplicated the module_id one; presumably
                               this is the length of the data in bulk_data --
                               confirm against the sender/receiver code. */
    uint8_t type;           /* Type from 0 to 255. */
    unsigned char bulk_data[3]; /* 3 bytes just as placeholder. */
} clusterMsgModule;

219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234
union clusterMsgData {
    /* PING, MEET and PONG */
    struct {
        /* Array of N clusterMsgDataGossip structures */
        clusterMsgDataGossip gossip[1];
    } ping;

    /* FAIL */
    struct {
        clusterMsgDataFail about;
    } fail;

    /* PUBLISH */
    struct {
        clusterMsgDataPublish msg;
    } publish;
235 236 237 238 239

    /* UPDATE */
    struct {
        clusterMsgDataUpdate nodecfg;
    } update;
240 241 242 243 244

    /* MODULE */
    struct {
        clusterMsgModule msg;
    } module;
245 246
};

247
/* Cluster bus protocol version; the `ver` field of the message header is
 * documented as currently set to this value. */
#define CLUSTER_PROTO_VER 1
248

249
typedef struct {
J
Jack Drogon 已提交
250
    char sig[4];        /* Signature "RCmb" (Redis Cluster message bus). */
251
    uint32_t totlen;    /* Total length of this message */
252
    uint16_t ver;       /* Protocol version, currently set to 1. */
253
    uint16_t port;      /* TCP base port number. */
254 255 256
    uint16_t type;      /* Message type */
    uint16_t count;     /* Only used for some kind of messages. */
    uint64_t currentEpoch;  /* The epoch accordingly to the sending node. */
A
antirez 已提交
257 258 259
    uint64_t configEpoch;   /* The config epoch if it's a master, or the last
                               epoch advertised by its master if it is a
                               slave. */
260 261
    uint64_t offset;    /* Master replication offset if node is a master or
                           processed replication offset if node is a slave. */
A
antirez 已提交
262 263 264
    char sender[CLUSTER_NAMELEN]; /* Name of the sender node */
    unsigned char myslots[CLUSTER_SLOTS/8];
    char slaveof[CLUSTER_NAMELEN];
265 266
    char myip[NET_IP_STR_LEN];    /* Sender IP, if not all zeroed. */
    char notused1[34];  /* 34 bytes reserved for future usage. */
267 268
    uint16_t cport;      /* Sender TCP cluster bus port */
    uint16_t flags;      /* Sender node flags */
269
    unsigned char state; /* Cluster state from the POV of the sender */
270
    unsigned char mflags[3]; /* Message flags: CLUSTERMSG_FLAG[012]_... */
271 272 273 274 275
    union clusterMsgData data;
} clusterMsg;

#define CLUSTERMSG_MIN_LEN (sizeof(clusterMsg)-sizeof(union clusterMsgData))

276 277 278
/* Message flags better specify the packet content or are used to
 * provide some information about the node state. */
#define CLUSTERMSG_FLAG0_PAUSED (1<<0) /* Master paused for manual failover. */
#define CLUSTERMSG_FLAG0_FORCEACK (1<<1) /* Give ACK to AUTH_REQUEST even if
                                            master is up. */
281

A
antirez 已提交
282
/* ---------------------- API exported outside cluster.c -------------------- */
283 284 285
clusterNode *getNodeByQuery(client *c, struct redisCommand *cmd, robj **argv, int argc, int *hashslot, int *ask);
int clusterRedirectBlockedClientIfNeeded(client *c);
void clusterRedirectClient(client *c, clusterNode *n, int hashslot, int error_code);
unsigned long getClusterConnectionsCount(void);
287

A
antirez 已提交
288
#endif /* __CLUSTER_H */