提交 c5618e7f 编写于 作者: A antirez

WAIT command: synchronous replication for Redis.

上级 c2f30554
......@@ -130,6 +130,8 @@ void processUnblockedClients(void) {
void unblockClient(redisClient *c) {
if (c->btype == REDIS_BLOCKED_LIST) {
unblockClientWaitingData(c);
} else if (c->btype == REDIS_BLOCKED_WAIT) {
unblockClientWaitingReplicas(c);
} else {
redisPanic("Unknown btype in unblockClient().");
}
......@@ -147,6 +149,8 @@ void unblockClient(redisClient *c) {
void replyToBlockedClientTimedOut(redisClient *c) {
if (c->btype == REDIS_BLOCKED_LIST) {
addReply(c,shared.nullmultibulk);
} else if (c->btype == REDIS_BLOCKED_WAIT) {
addReplyLongLong(c,replicationCountAcksByOffset(c->bpop.reploffset));
} else {
redisPanic("Unknown btype in replyToBlockedClientTimedOut().");
}
......
......@@ -114,6 +114,7 @@ redisClient *createClient(int fd) {
c->bpop.target = NULL;
c->bpop.numreplicas = 0;
c->bpop.reploffset = 0;
c->woff = 0;
c->watched_keys = listCreate();
c->pubsub_channels = dictCreate(&setDictType,NULL);
c->pubsub_patterns = listCreate();
......
......@@ -263,7 +263,8 @@ struct redisCommand redisCommandTable[] = {
{"script",scriptCommand,-2,"ras",0,NULL,0,0,0,0,0},
{"time",timeCommand,1,"rR",0,NULL,0,0,0,0,0},
{"bitop",bitopCommand,-4,"wm",0,NULL,2,-1,1,0,0},
{"bitcount",bitcountCommand,-2,"r",0,NULL,1,1,1,0,0}
{"bitcount",bitcountCommand,-2,"r",0,NULL,1,1,1,0,0},
{"wait",waitCommand,3,"rs",0,NULL,0,0,0,0,0}
};
/*============================ Utility functions ============================ */
......@@ -1200,8 +1201,29 @@ void beforeSleep(struct aeEventLoop *eventLoop) {
if (server.active_expire_enabled && server.masterhost == NULL)
activeExpireCycle(ACTIVE_EXPIRE_CYCLE_FAST);
/* Send all the slaves an ACK request if at least one client blocked
* during the previous event loop iteration. */
if (server.get_ack_from_slaves) {
robj *argv[3];
argv[0] = createStringObject("REPLCONF",8);
argv[1] = createStringObject("GETACK",6);
argv[2] = createStringObject("*",1); /* Not used argument. */
replicationFeedSlaves(server.slaves, server.slaveseldb, argv, 3);
decrRefCount(argv[0]);
decrRefCount(argv[1]);
decrRefCount(argv[2]);
server.get_ack_from_slaves = 0;
}
/* Unblock all the clients blocked for synchronous replication
* in WAIT. */
if (listLength(server.clients_waiting_acks))
processClientsWaitingReplicas();
/* Try to process pending commands for clients that were just unblocked. */
processUnblockedClients();
if (listLength(server.unblocked_clients))
processUnblockedClients();
/* Write the AOF buffer on disk */
flushAppendOnlyFile(0);
......@@ -1557,6 +1579,8 @@ void initServer() {
server.slaveseldb = -1; /* Force to emit the first SELECT command. */
server.unblocked_clients = listCreate();
server.ready_keys = listCreate();
server.clients_waiting_acks = listCreate();
server.get_ack_from_slaves = 0;
createSharedObjects();
adjustOpenFilesLimit();
......@@ -2079,6 +2103,7 @@ int processCommand(redisClient *c) {
addReply(c,shared.queued);
} else {
call(c,REDIS_CALL_FULL);
c->woff = server.master_repl_offset;
if (listLength(server.ready_keys))
handleClientsBlockedOnLists();
}
......
......@@ -495,7 +495,8 @@ typedef struct redisClient {
int slave_listening_port; /* As configured with: SLAVECONF listening-port */
multiState mstate; /* MULTI/EXEC state */
int btype; /* Type of blocking op if REDIS_BLOCKED. */
blockingState bpop; /* blocking state */
blockingState bpop; /* blocking state */
long long woff; /* Last write global replication offset. */
list *watched_keys; /* Keys WATCHED for MULTI/EXEC CAS */
dict *pubsub_channels; /* channels a client is interested in (SUBSCRIBE) */
list *pubsub_patterns; /* patterns a client is interested in (SUBSCRIBE) */
......@@ -750,6 +751,9 @@ struct redisServer {
dict *repl_scriptcache_dict; /* SHA1 all slaves are aware of. */
list *repl_scriptcache_fifo; /* First in, first out LRU eviction. */
int repl_scriptcache_size; /* Max number of elements. */
/* Synchronous replication. */
list *clients_waiting_acks; /* Clients waiting in WAIT command. */
int get_ack_from_slaves; /* If true we send REPLCONF GETACK. */
/* Limits */
unsigned int maxclients; /* Max number of simultaneous clients */
unsigned long long maxmemory; /* Max number of memory bytes to use */
......@@ -1063,6 +1067,9 @@ void replicationScriptCacheInit(void);
void replicationScriptCacheFlush(void);
void replicationScriptCacheAdd(sds sha1);
int replicationScriptCacheExists(sds sha1);
void processClientsWaitingReplicas(void);
void unblockClientWaitingReplicas(redisClient *c);
int replicationCountAcksByOffset(long long offset);
/* Generic persistence functions */
void startLoading(FILE *fp);
......@@ -1398,6 +1405,7 @@ void timeCommand(redisClient *c);
void bitopCommand(redisClient *c);
void bitcountCommand(redisClient *c);
void replconfCommand(redisClient *c);
void waitCommand(redisClient *c);
#if defined(__GNUC__)
void *calloc(size_t count, size_t size) __attribute__ ((deprecated));
......
......@@ -39,6 +39,7 @@
void replicationDiscardCachedMaster(void);
void replicationResurrectCachedMaster(int newfd);
void replicationSendAck(void);
/* ---------------------------------- MASTER -------------------------------- */
......@@ -560,6 +561,11 @@ void replconfCommand(redisClient *c) {
c->repl_ack_time = server.unixtime;
/* Note: this command does not reply anything! */
return;
} else if (!strcasecmp(c->argv[j]->ptr,"getack")) {
/* REPLCONF GETACK is used in order to request an ACK ASAP
* to the slave. */
if (server.masterhost && server.master) replicationSendAck();
/* Note: this command does not reply anything! */
} else {
addReplyErrorFormat(c,"Unrecognized REPLCONF option: %s",
(char*)c->argv[j]->ptr);
......@@ -1495,7 +1501,136 @@ int replicationScriptCacheExists(sds sha1) {
return dictFind(server.repl_scriptcache_dict,sha1) != NULL;
}
/* --------------------------- REPLICATION CRON ----------------------------- */
/* ----------------------- SYNCHRONOUS REPLICATION --------------------------
* Redis synchronous replication design can be summarized in points:
*
* - Redis masters have a global replication offset, used by PSYNC.
* - Master increment the offset every time new commands are sent to slaves.
* - Slaves ping back masters with the offset processed so far.
*
* So synchronous replication adds a new WAIT command in the form:
*
* WAIT <num_replicas> <milliseconds_timeout>
*
* That returns the number of replicas that processed the query when
* we finally have at least num_replicas, or when the timeout was
* reached.
*
* The command is implemented in this way:
*
* - Every time a client processes a command, we remember the replication
* offset after sending that command to the slaves.
* - When WAIT is called, we ask slaves to send an acknowledgement ASAP.
* The client is blocked at the same time (see blocked.c).
* - Once we receive enough ACKs for a given offset or when the timeout
* is reached, the WAIT command is unblocked and the reply sent to the
* client.
*/
/* This just set a flag so that we broadcast a REPLCONF GETACK command
* to all the slaves in the beforeSleep() function. Note that this way
* we "group" all the clients that want to wait for synchronouns replication
* in a given event loop iteration, and send a single GETACK for them all. */
void replicationRequestAckFromSlaves(void) {
server.get_ack_from_slaves = 1;
}
/* Return the number of slaves that already acknowledged the specified
* replication offset. */
int replicationCountAcksByOffset(long long offset) {
listIter li;
listNode *ln;
int count = 0;
listRewind(server.slaves,&li);
while((ln = listNext(&li))) {
redisClient *slave = ln->value;
if (slave->replstate != REDIS_REPL_ONLINE) continue;
if (slave->repl_ack_off >= offset) count++;
}
return count;
}
/* WAIT for N replicas to acknowledge the processing of our latest
* write command (and all the previous commands). */
void waitCommand(redisClient *c) {
mstime_t timeout;
long numreplicas, ackreplicas;
long long offset = c->woff;
/* Argument parsing. */
if (getLongFromObjectOrReply(c,c->argv[1],&numreplicas,NULL) != REDIS_OK)
return;
if (getTimeoutFromObjectOrReply(c,c->argv[2],&timeout,UNIT_MILLISECONDS)
!= REDIS_OK) return;
/* First try without blocking at all. */
ackreplicas = replicationCountAcksByOffset(c->woff);
if (ackreplicas >= numreplicas || c->flags & REDIS_MULTI) {
addReplyLongLong(c,ackreplicas);
return;
}
/* Otherwise block the client and put it into our list of clients
* waiting for ack from slaves. */
c->bpop.timeout = timeout;
c->bpop.reploffset = offset;
c->bpop.numreplicas = numreplicas;
listAddNodeTail(server.clients_waiting_acks,c);
blockClient(c,REDIS_BLOCKED_WAIT);
/* Make sure that the server will send an ACK request to all the slaves
* before returning to the event loop. */
replicationRequestAckFromSlaves();
}
/* This is called by unblockClient() to perform the blocking op type
* specific cleanup. We just remove the client from the list of clients
* waiting for replica acks. Never call it directly, call unblockClient()
* instead. */
void unblockClientWaitingReplicas(redisClient *c) {
listNode *ln = listSearchKey(server.clients_waiting_acks,c);
redisAssert(ln != NULL);
listDelNode(server.clients_waiting_acks,ln);
}
/* Check if there are clients blocked in WAIT that can be unblocked since
* we received enough ACKs from slaves. */
void processClientsWaitingReplicas(void) {
long long last_offset = 0;
int last_numreplicas = 0;
listIter li;
listNode *ln;
listRewind(server.clients_waiting_acks,&li);
while((ln = listNext(&li))) {
redisClient *c = ln->value;
/* Every time we find a client that is satisfied for a given
* offset and number of replicas, we remember it so the next client
* may be unblocked without calling replicationCountAcksByOffset()
* if the requested offset / replicas were equal or less. */
if (last_offset && last_offset > c->bpop.reploffset &&
last_numreplicas > c->bpop.numreplicas)
{
unblockClient(c);
addReplyLongLong(c,last_numreplicas);
} else {
int numreplicas = replicationCountAcksByOffset(c->bpop.reploffset);
if (numreplicas >= c->bpop.numreplicas) {
last_offset = c->bpop.reploffset;
last_numreplicas = numreplicas;
unblockClient(c);
addReplyLongLong(c,numreplicas);
}
}
}
}
/* --------------------------- REPLICATION CRON ---------------------------- */
/* Replication cron funciton, called 1 time per second. */
void replicationCron(void) {
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册