redis.c 339.9 KB
Newer Older
A
antirez 已提交
1
/*
2
 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
A
antirez 已提交
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of Redis nor the names of its contributors may be used
 *     to endorse or promote products derived from this software without
 *     specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

A
antirez 已提交
30
#define REDIS_VERSION "1.3.8"
31 32

#include "fmacros.h"
A
antirez 已提交
33
#include "config.h"
A
antirez 已提交
34 35 36 37 38 39

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
H
hrothgar 已提交
40
#define __USE_POSIX199309
A
antirez 已提交
41
#define __USE_UNIX98
A
antirez 已提交
42
#include <signal.h>
A
antirez 已提交
43 44

#ifdef HAVE_BACKTRACE
H
hrothgar 已提交
45 46
#include <execinfo.h>
#include <ucontext.h>
A
antirez 已提交
47 48
#endif /* HAVE_BACKTRACE */

A
antirez 已提交
49 50 51 52 53 54 55 56 57 58 59
#include <sys/wait.h>
#include <errno.h>
#include <assert.h>
#include <ctype.h>
#include <stdarg.h>
#include <inttypes.h>
#include <arpa/inet.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/time.h>
#include <sys/resource.h>
60
#include <sys/uio.h>
61
#include <limits.h>
62
#include <math.h>
63
#include <pthread.h>
A
antirez 已提交
64 65

#if defined(__sun)
A
antirez 已提交
66 67
#include "solarisfixes.h"
#endif
A
antirez 已提交
68

H
hrothgar 已提交
69
#include "redis.h"
A
antirez 已提交
70 71 72 73 74 75
#include "ae.h"     /* Event driven programming library */
#include "sds.h"    /* Dynamic safe strings */
#include "anet.h"   /* Networking the easy way */
#include "dict.h"   /* Hash tables */
#include "adlist.h" /* Linked lists */
#include "zmalloc.h" /* total memory usage aware version of malloc/free */
76 77
#include "lzf.h"    /* LZF compression library */
#include "pqsort.h" /* Partial qsort for SORT+LIMIT */
78
#include "zipmap.h"
A
antirez 已提交
79 80 81 82 83 84 85 86

/* Error codes */
#define REDIS_OK                0
#define REDIS_ERR               -1

/* Static server configuration */
#define REDIS_SERVERPORT        6379    /* TCP port */
#define REDIS_MAXIDLETIME       (60*5)  /* default client timeout */
87
#define REDIS_IOBUF_LEN         1024
A
antirez 已提交
88
#define REDIS_LOADBUF_LEN       1024
89
#define REDIS_STATIC_ARGS       8
A
antirez 已提交
90 91 92 93
#define REDIS_DEFAULT_DBNUM     16
#define REDIS_CONFIGLINE_MAX    1024
#define REDIS_OBJFREELIST_MAX   1000000 /* Max number of objects to cache */
#define REDIS_MAX_SYNC_TIME     60      /* Slave can't take more to sync */
94
#define REDIS_EXPIRELOOKUPS_PER_CRON    10 /* try to expire 10 keys/loop */
95
#define REDIS_MAX_WRITE_PER_EVENT (1024*64)
96 97 98 99 100 101
#define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command */

/* If more than REDIS_WRITEV_THRESHOLD write packets are pending use writev */
#define REDIS_WRITEV_THRESHOLD      3
/* Max number of iovecs used for each writev call */
#define REDIS_WRITEV_IOVEC_COUNT    256
A
antirez 已提交
102 103 104 105 106

/* Hash table parameters */
#define REDIS_HT_MINFILL        10      /* Minimal hash table fill 10% */

/* Command flags */
A
antirez 已提交
107 108 109 110 111 112 113
#define REDIS_CMD_BULK          1       /* Bulk write command */
#define REDIS_CMD_INLINE        2       /* Inline command */
/* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
   this flag will return an error when the 'maxmemory' option is set in the
   config file and the server is using more than maxmemory bytes of memory.
   In short, these commands are denied in low memory conditions. */
#define REDIS_CMD_DENYOOM       4
114
#define REDIS_CMD_FORCE_REPLICATION 8 /* Force replication even if dirty is 0 */
A
antirez 已提交
115 116 117 118 119

/* Object types */
#define REDIS_STRING 0
#define REDIS_LIST 1
#define REDIS_SET 2
120 121
#define REDIS_ZSET 3
#define REDIS_HASH 4
122

123 124 125
/* Object encodings. Some kinds of objects, like Strings and Hashes, can be
 * internally represented in multiple ways. The 'encoding' field of the object
 * is set to one of the following values. */
126 127
#define REDIS_ENCODING_RAW 0    /* Raw representation */
#define REDIS_ENCODING_INT 1    /* Encoded as integer */
128 129
#define REDIS_ENCODING_ZIPMAP 2 /* Encoded as zipmap */
#define REDIS_ENCODING_HT 3     /* Encoded as an hash table */
130

131 132 133 134
/* Printable names of the object encodings, listed in the same order as the
 * numeric REDIS_ENCODING_* values (0 = raw ... 3 = hashtable). */
static char *strencoding[] = {
    "raw",       /* REDIS_ENCODING_RAW */
    "int",       /* REDIS_ENCODING_INT */
    "zipmap",    /* REDIS_ENCODING_ZIPMAP */
    "hashtable"  /* REDIS_ENCODING_HT */
};

135
/* Object types only used for dumping to disk */
A
antirez 已提交
136
#define REDIS_EXPIRETIME 253
A
antirez 已提交
137 138 139
#define REDIS_SELECTDB 254
#define REDIS_EOF 255

140 141 142 143 144 145 146
/* Defines related to the dump file format. Storing 32 bit lengths for short
 * keys requires a lot of space, so we check the most significant 2 bits of
 * the first byte to interpret the length:
 *
 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
 * 01|000000 00000000 => if 01, the len is 14 bits: 6 bits + 8 bits of next byte
 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
 * 11|000000 this means: specially encoded object will follow. The six bits
 *           number specifies the kind of object that follows.
 *           See the REDIS_RDB_ENC_* defines.
 *
 * Lengths up to 63 are stored using a single byte, most DB keys, and many
 * values, will fit inside. */
153 154 155
#define REDIS_RDB_6BITLEN 0
#define REDIS_RDB_14BITLEN 1
#define REDIS_RDB_32BITLEN 2
156
#define REDIS_RDB_ENCVAL 3
157 158
#define REDIS_RDB_LENERR UINT_MAX

A
antirez 已提交
159 160 161 162 163 164
/* When a length of a string object stored on disk has the first two bits
 * set, the remaining six bits specify a special encoding for the object
 * according to the following defines: */
#define REDIS_RDB_ENC_INT8 0        /* 8 bit signed integer */
#define REDIS_RDB_ENC_INT16 1       /* 16 bit signed integer */
#define REDIS_RDB_ENC_INT32 2       /* 32 bit signed integer */
A
antirez 已提交
165
#define REDIS_RDB_ENC_LZF 3         /* string compressed with FASTLZ */
A
antirez 已提交
166

167 168 169 170 171 172
/* Virtual memory object->where field. */
#define REDIS_VM_MEMORY 0       /* The object is on memory */
#define REDIS_VM_SWAPPED 1      /* The object is on disk */
#define REDIS_VM_SWAPPING 2     /* Redis is swapping this object on disk */
#define REDIS_VM_LOADING 3      /* Redis is loading this object from disk */

A
antirez 已提交
173 174 175 176
/* Virtual memory static configuration stuff.
 * Check vmFindContiguousPages() to know more about these magic numbers. */
#define REDIS_VM_MAX_NEAR_PAGES 65536
#define REDIS_VM_MAX_RANDOM_JUMP 4096
177
#define REDIS_VM_MAX_THREADS 32
178
#define REDIS_THREAD_STACK_SIZE (1024*1024*4)
179 180 181 182
/* The following is the *percentage* of completed I/O jobs to process when the
 * handler is called. While Virtual Memory I/O operations are performed by
 * threads, these operations must be processed by the main thread when completed
 * in order to take effect. */
183
#define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
A
antirez 已提交
184

A
antirez 已提交
185
/* Client flags */
A
antirez 已提交
186 187 188 189 190 191
#define REDIS_SLAVE 1       /* This client is a slave server */
#define REDIS_MASTER 2      /* This client is a master server */
#define REDIS_MONITOR 4     /* This client is a slave monitor, see MONITOR */
#define REDIS_MULTI 8       /* This client is in a MULTI context */
#define REDIS_BLOCKED 16    /* The client is waiting in a blocking operation */
#define REDIS_IO_WAIT 32    /* The client is waiting for Virtual Memory I/O */
A
antirez 已提交
192

193
/* Slave replication state - slave side */
A
antirez 已提交
194 195 196 197
#define REDIS_REPL_NONE 0   /* No active replication */
#define REDIS_REPL_CONNECT 1    /* Must connect to master */
#define REDIS_REPL_CONNECTED 2  /* Connected to master */

198 199 200 201 202 203 204 205 206
/* Slave replication state - from the point of view of master
 * Note that in SEND_BULK and ONLINE state the slave receives new updates
 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
 * to start the next background saving in order to send updates to it. */
#define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
#define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to start bulk DB transmission */
#define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
#define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */

A
antirez 已提交
207 208 209 210 211 212
/* List related stuff */
#define REDIS_HEAD 0
#define REDIS_TAIL 1

/* Sort operations */
#define REDIS_SORT_GET 0
A
antirez 已提交
213 214
#define REDIS_SORT_ASC 1
#define REDIS_SORT_DESC 2
A
antirez 已提交
215 216 217 218
#define REDIS_SORTKEY_MAX 1024

/* Log levels */
#define REDIS_DEBUG 0
219 220 221
#define REDIS_VERBOSE 1
#define REDIS_NOTICE 2
#define REDIS_WARNING 3
A
antirez 已提交
222 223 224 225

/* Anti-warning macro... */
#define REDIS_NOTUSED(V) ((void) V)

226 227
#define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
#define ZSKIPLIST_P 0.25      /* Skiplist P = 1/4 */
A
antirez 已提交
228

229 230 231 232 233
/* Append only defines */
#define APPENDFSYNC_NO 0
#define APPENDFSYNC_ALWAYS 1
#define APPENDFSYNC_EVERYSEC 2

234 235 236 237
/* Hashes related defaults */
#define REDIS_HASH_MAX_ZIPMAP_ENTRIES 64
#define REDIS_HASH_MAX_ZIPMAP_VALUE 512

238
/* We can print the stacktrace, so our assert is defined this way: when the
 * expression _e evaluates to false, _redisAssert() is called with the
 * stringified expression, the file name and the line number, and then the
 * process is terminated with _exit(1). */
#define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)))
static void _redisAssert(char *estr, char *file, int line);
241

A
antirez 已提交
242 243 244
/*================================= Data types ============================== */

/* A redis object, that is a type able to hold a string / list / set */
245 246 247

/* The VM object structure: the swap-related fields carried by a Redis object.
 * NOTE(review): the trailing declarator also defines a file-scope variable
 * named 'vm' of this type. It is kept as-is because the rest of the file is
 * not visible from here -- confirm whether anything actually uses it. */
struct redisObjectVM {
    off_t page;         /* the page at which the object is stored on disk */
    off_t usedpages;    /* number of pages used on disk */
    time_t atime;       /* Last access time */
} vm;

/* The actual Redis Object */
A
antirez 已提交
254 255
typedef struct redisObject {
    void *ptr;
256 257
    unsigned char type;
    unsigned char encoding;
258 259 260 261
    unsigned char storage;  /* If this object is a key, where is the value?
                             * REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
    unsigned char vtype; /* If this object is a key, and value is swapped out,
                          * this is the type of the swapped out object. */
A
antirez 已提交
262
    int refcount;
263 264 265 266 267
    /* VM fields, this are only allocated if VM is active, otherwise the
     * object allocation function will just allocate
     * sizeof(redisObjct) minus sizeof(redisObjectVM), so using
     * Redis without VM active will not have any overhead. */
    struct redisObjectVM vm;
A
antirez 已提交
268 269
} robj;

270 271 272 273 274 275 276 277 278
/* Macro used to initialize a Redis object allocated on the stack.
 * Note that this macro is kept near the structure definition to make sure
 * we'll update it when the structure is changed, to avoid bugs like
 * bug #85 introduced exactly in this way.
 *
 * _var is the object itself (an lvalue, not a pointer) and _ptr is the value
 * stored into its 'ptr' field. The 'storage' field is only written when
 * server.vm_enabled is true.
 *
 * The expansion intentionally has no trailing semicolon: the caller supplies
 * it, so "initStaticStringObject(o,p);" is exactly one statement and can be
 * used as the unbraced body of an if/else. Macro arguments are parenthesized
 * so that expressions such as "*p" can be passed safely. */
#define initStaticStringObject(_var,_ptr) do { \
    (_var).refcount = 1; \
    (_var).type = REDIS_STRING; \
    (_var).encoding = REDIS_ENCODING_RAW; \
    (_var).ptr = (_ptr); \
    if (server.vm_enabled) (_var).storage = REDIS_VM_MEMORY; \
} while(0)

A
antirez 已提交
282
typedef struct redisDb {
A
antirez 已提交
283 284 285
    dict *dict;                 /* The keyspace for this DB */
    dict *expires;              /* Timeout of keys with a timeout set */
    dict *blockingkeys;         /* Keys with clients waiting for data (BLPOP) */
A
antirez 已提交
286
    dict *io_keys;              /* Keys with clients waiting for VM I/O */
A
antirez 已提交
287 288 289
    int id;
} redisDb;

A
antirez 已提交
290 291 292 293 294 295 296 297 298 299 300 301
/* Client MULTI/EXEC state.
 *
 * multiCmd is a single command held in a client's MULTI state (it is the
 * element type of multiState.commands). */
typedef struct multiCmd {
    robj **argv;                /* Argument vector of the command */
    int argc;                   /* Argument count (presumably the length of
                                 * argv -- confirm) */
    struct redisCommand *cmd;   /* Command descriptor for this entry */
} multiCmd;

/* The set of commands accumulated for a client in a MULTI context; embedded
 * in redisClient as the 'mstate' field. */
typedef struct multiState {
    multiCmd *commands;     /* Array of MULTI commands */
    int count;              /* Total number of MULTI commands */
} multiState;

A
antirez 已提交
302 303 304 305
/* With multiplexing we need to take per-clinet state.
 * Clients are taken in a liked list. */
typedef struct redisClient {
    int fd;
A
antirez 已提交
306
    redisDb *db;
A
antirez 已提交
307 308
    int dictid;
    sds querybuf;
309 310
    robj **argv, **mbargv;
    int argc, mbargc;
311
    int bulklen;            /* bulk read len. -1 if not in bulk read mode */
312
    int multibulk;          /* multi bulk command format active */
A
antirez 已提交
313 314 315
    list *reply;
    int sentlen;
    time_t lastinteraction; /* time of the last interaction, used for timeout */
A
antirez 已提交
316
    int flags;              /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
317 318 319 320
    int slaveseldb;         /* slave selected db, if this client is a slave */
    int authenticated;      /* when requirepass is non-NULL */
    int replstate;          /* replication state if this is a slave */
    int repldbfd;           /* replication DB file descriptor */
A
antirez 已提交
321
    long repldboff;         /* replication DB file offset */
322
    off_t repldbsize;       /* replication DB file size */
A
antirez 已提交
323
    multiState mstate;      /* MULTI/EXEC state */
A
antirez 已提交
324
    robj **blockingkeys;    /* The key we are waiting to terminate a blocking
A
antirez 已提交
325
                             * operation such as BLPOP. Otherwise NULL. */
326
    int blockingkeysnum;    /* Number of blocking keys */
A
antirez 已提交
327 328
    time_t blockingto;      /* Blocking operation timeout. If UNIX current time
                             * is >= blockingto then the operation timed out. */
329 330
    list *io_keys;          /* Keys this client is waiting to be loaded from the
                             * swap file in order to continue. */
A
antirez 已提交
331 332
    dict *pubsub_channels;  /* channels a client is interested in (SUBSCRIBE) */
    list *pubsub_patterns;  /* patterns a client is interested in (SUBSCRIBE) */
A
antirez 已提交
333 334 335 336 337 338 339 340 341 342 343
} redisClient;

/* One entry of the array referenced by redisServer.saveparams.
 * NOTE(review): presumably means "save after 'seconds' seconds if at least
 * 'changes' modifications happened" -- confirm against the code reading it. */
struct saveparam {
    time_t seconds;
    int changes;
};

/* Global server state structure */
struct redisServer {
    int port;
    int fd;
A
antirez 已提交
344
    redisDb *db;
A
antirez 已提交
345 346
    long long dirty;            /* changes to DB from the last save */
    list *clients;
A
antirez 已提交
347
    list *slaves, *monitors;
A
antirez 已提交
348 349 350 351 352 353 354 355 356
    char neterr[ANET_ERR_LEN];
    aeEventLoop *el;
    int cronloops;              /* number of times the cron function run */
    list *objfreelist;          /* A list of freed objects to avoid malloc() */
    time_t lastsave;            /* Unix time of last save succeeede */
    /* Fields used only for stats */
    time_t stat_starttime;         /* server start time */
    long long stat_numcommands;    /* number of processed commands */
    long long stat_numconnections; /* number of connections received */
A
antirez 已提交
357
    long long stat_expiredkeys;   /* number of expired keys */
A
antirez 已提交
358 359 360 361 362 363
    /* Configuration */
    int verbosity;
    int glueoutputbuf;
    int maxidletime;
    int dbnum;
    int daemonize;
364
    int appendonly;
365 366
    int appendfsync;
    time_t lastfsync;
367 368
    int appendfd;
    int appendseldb;
369
    char *pidfile;
370
    pid_t bgsavechildpid;
371 372
    pid_t bgrewritechildpid;
    sds bgrewritebuf; /* buffer taken by parent during oppend only rewrite */
A
antirez 已提交
373 374 375 376 377
    struct saveparam *saveparams;
    int saveparamslen;
    char *logfile;
    char *bindaddr;
    char *dbfilename;
378
    char *appendfilename;
B
Brian Hammond 已提交
379
    char *requirepass;
380
    int shareobjects;
381
    int rdbcompression;
A
antirez 已提交
382 383
    /* Replication related */
    int isslave;
384
    char *masterauth;
A
antirez 已提交
385 386
    char *masterhost;
    int masterport;
387
    redisClient *master;    /* client that is master for this slave */
A
antirez 已提交
388
    int replstate;
389
    unsigned int maxclients;
A
antirez 已提交
390
    unsigned long long maxmemory;
A
antirez 已提交
391 392
    unsigned int blpop_blocked_clients;
    unsigned int vm_blocked_clients;
A
antirez 已提交
393 394 395 396 397
    /* Sort parameters - qsort_r() is only available under BSD so we
     * have to take this state global, in order to pass it to sortCompare() */
    int sort_desc;
    int sort_alpha;
    int sort_bypattern;
398 399
    /* Virtual memory configuration */
    int vm_enabled;
400
    char *vm_swap_file;
401 402
    off_t vm_page_size;
    off_t vm_pages;
A
antirez 已提交
403
    unsigned long long vm_max_memory;
404 405 406
    /* Hashes config */
    size_t hash_max_zipmap_entries;
    size_t hash_max_zipmap_value;
407 408 409 410 411
    /* Virtual memory state */
    FILE *vm_fp;
    int vm_fd;
    off_t vm_next_page; /* Next probably empty page */
    off_t vm_near_pages; /* Number of pages allocated sequentially */
A
antirez 已提交
412
    unsigned char *vm_bitmap; /* Bitmap of free/used pages */
A
antirez 已提交
413
    time_t unixtime;    /* Unix time sampled every second. */
414 415
    /* Virtual memory I/O threads stuff */
    /* An I/O thread process an element taken from the io_jobs queue and
416 417 418 419 420
     * put the result of the operation in the io_done list. While the
     * job is being processed, it's put on io_processing queue. */
    list *io_newjobs; /* List of VM I/O jobs yet to be processed */
    list *io_processing; /* List of VM I/O jobs being processed */
    list *io_processed; /* List of VM I/O jobs already processed */
A
antirez 已提交
421
    list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
422
    pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job */
423 424
    pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
    pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
425
    pthread_attr_t io_threads_attr; /* attributes for threads creation */
426 427
    int io_active_threads; /* Number of running I/O threads */
    int vm_max_threads; /* Max number of I/O threads running at the same time */
428 429 430 431 432 433
    /* Our main thread is blocked on the event loop, locking for sockets ready
     * to be read or written, so when a threaded I/O operation is ready to be
     * processed by the main thread, the I/O thread will use a unix pipe to
     * awake the main thread. The followings are the two pipe FDs. */
    int io_ready_pipe_read;
    int io_ready_pipe_write;
A
antirez 已提交
434 435 436 437 438
    /* Virtual memory stats */
    unsigned long long vm_stats_used_pages;
    unsigned long long vm_stats_swapped_objects;
    unsigned long long vm_stats_swapouts;
    unsigned long long vm_stats_swapins;
A
antirez 已提交
439
    /* Pubsub */
A
antirez 已提交
440 441
    dict *pubsub_channels; /* Map channels to list of subscribed clients */
    list *pubsub_patterns; /* A list of pubsub_patterns */
A
antirez 已提交
442
    /* Misc */
443
    FILE *devnull;
A
antirez 已提交
444 445
};

A
antirez 已提交
446 447 448 449 450
/* Pairs a client with a pattern object.
 * NOTE(review): presumably the element type stored in the pubsub_patterns
 * lists -- confirm against the code that fills them. */
typedef struct pubsubPattern {
    redisClient *client;    /* The client side of the pair */
    robj *pattern;          /* The pattern object */
} pubsubPattern;

A
antirez 已提交
451 452 453 454 455 456
/* Function type used for command procedures: takes the client, returns
 * nothing. */
typedef void redisCommandProc(redisClient *c);

/* Descriptor of a single command. */
struct redisCommand {
    char *name;
    redisCommandProc *proc;
    int arity;
    int flags;  /* presumably a combination of REDIS_CMD_* flags -- confirm */
    /* Use a function to determine which keys need to be loaded
     * in the background prior to executing this command. Takes precedence
     * over vm_firstkey and others, ignored when NULL */
    redisCommandProc *vm_preload_proc;
    /* What keys should be loaded in background when calling this command? */
    int vm_firstkey; /* The first argument that's a key (0 = no keys) */
    int vm_lastkey;  /* The last argument that's a key */
    int vm_keystep;  /* The step between first and last key */
};

467 468
/* Associates a name with a value stored as an unsigned long.
 * NOTE(review): presumably a function name / address pair used to symbolize
 * stack traces -- confirm against the code that fills the table. */
struct redisFunctionSym {
    char *name;
    unsigned long pointer;
};

A
antirez 已提交
472 473 474 475 476 477 478 479 480 481 482 483 484
/* An object together with the value used to order it: either a numeric
 * score or another object (only one member of the union is meaningful at a
 * time). NOTE(review): presumably one element of the vector sorted by SORT --
 * confirm. */
typedef struct _redisSortObject {
    robj *obj;
    union {
        double score;   /* Numeric sort weight */
        robj *cmpobj;   /* Object used for comparison instead of a score */
    } u;
} redisSortObject;

/* A typed operation carrying a pattern object.
 * NOTE(review): 'type' is presumably one of the REDIS_SORT_* constants
 * (e.g. REDIS_SORT_GET) -- confirm. */
typedef struct _redisSortOperation {
    int type;
    robj *pattern;
} redisSortOperation;

485 486 487 488
/* ZSETs use a specialized version of Skiplists */

typedef struct zskiplistNode {
    struct zskiplistNode **forward;
489
    struct zskiplistNode *backward;
490
    unsigned int *span;
491 492 493 494 495
    double score;
    robj *obj;
} zskiplistNode;

/* The skiplist itself: pointers to its header and tail nodes, a length and a
 * level. NOTE(review): 'length' is presumably the number of elements and
 * 'level' the highest level currently in use -- confirm. */
typedef struct zskiplist {
    struct zskiplistNode *header, *tail;
    unsigned long length;
    int level;
} zskiplist;

501 502
typedef struct zset {
    dict *dict;
503
    zskiplist *zsl;
504 505
} zset;

506 507
/* Our shared "common" objects */

508
#define REDIS_SHARED_INTEGERS 10000
A
antirez 已提交
509
struct sharedObjectsStruct {
510
    robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
A
antirez 已提交
511
    *colon, *nullbulk, *nullmultibulk, *queued,
512 513
    *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
    *outofrangeerr, *plus,
A
antirez 已提交
514
    *select0, *select1, *select2, *select3, *select4,
A
antirez 已提交
515
    *select5, *select6, *select7, *select8, *select9,
A
antirez 已提交
516
    *messagebulk, *subscribebulk, *unsubscribebulk, *mbulk3,
517
    *psubscribebulk, *punsubscribebulk, *integers[REDIS_SHARED_INTEGERS];
A
antirez 已提交
518 519
} shared;

520 521 522 523 524 525
/* Global vars that are actually used as constants. The following double
 * values are used for double on-disk serialization, and are initialized
 * at runtime to avoid strange compiler optimizations. */

static double R_Zero, R_PosInf, R_NegInf, R_Nan;

526
/* VM threaded I/O request message */
527 528 529
#define REDIS_IOJOB_LOAD 0          /* Load from disk to memory */
#define REDIS_IOJOB_PREPARE_SWAP 1  /* Compute needed pages */
#define REDIS_IOJOB_DO_SWAP 2       /* Swap from memory to disk */
A
antirez 已提交
530
typedef struct iojob {
531
    int type;   /* Request type, REDIS_IOJOB_* */
532
    redisDb *db;/* Redis database */
533
    robj *key;  /* This I/O request is about swapping this key */
534
    robj *val;  /* the value to swap for REDIS_IOREQ_*_SWAP, otherwise this
535 536
                 * field is populated by the I/O thread for REDIS_IOREQ_LOAD. */
    off_t page; /* Swap page where to read/write the object */
537
    off_t pages; /* Swap pages needed to save object. PREPARE_SWAP return val */
538 539 540
    int canceled; /* True if this command was canceled by blocking side of VM */
    pthread_t thread; /* ID of the thread processing this entry */
} iojob;
541

A
antirez 已提交
542 543 544 545 546 547 548 549
/*================================ Prototypes =============================== */

static void freeStringObject(robj *o);
static void freeListObject(robj *o);
static void freeSetObject(robj *o);
static void decrRefCount(void *o);
static robj *createObject(int type, void *ptr);
static void freeClient(redisClient *c);
550
static int rdbLoad(char *filename);
A
antirez 已提交
551 552 553
static void addReply(redisClient *c, robj *obj);
static void addReplySds(redisClient *c, sds s);
static void incrRefCount(robj *o);
554
static int rdbSaveBackground(char *filename);
A
antirez 已提交
555
static robj *createStringObject(char *ptr, size_t len);
A
antirez 已提交
556
static robj *dupStringObject(robj *o);
557
static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc);
558
static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
A
antirez 已提交
559
static int syncWithMaster(void);
560
static robj *tryObjectEncoding(robj *o);
561
static robj *getDecodedObject(robj *o);
A
antirez 已提交
562 563 564
static int removeExpire(redisDb *db, robj *key);
static int expireIfNeeded(redisDb *db, robj *key);
static int deleteIfVolatile(redisDb *db, robj *key);
A
antirez 已提交
565
static int deleteIfSwapped(redisDb *db, robj *key);
566
static int deleteKey(redisDb *db, robj *key);
A
antirez 已提交
567 568
static time_t getExpire(redisDb *db, robj *key);
static int setExpire(redisDb *db, robj *key, time_t when);
569
static void updateSlavesWaitingBgsave(int bgsaveerr);
A
antirez 已提交
570
static void freeMemoryIfNeeded(void);
571
static int processCommand(redisClient *c);
572
static void setupSigSegvAction(void);
573
static void rdbRemoveTempFile(pid_t childpid);
574
static void aofRemoveTempFile(pid_t childpid);
575
static size_t stringObjectLen(robj *o);
A
antirez 已提交
576
static void processInputBuffer(redisClient *c);
577
static zskiplist *zslCreate(void);
578
static void zslFree(zskiplist *zsl);
579
static void zslInsert(zskiplist *zsl, double score, robj *obj);
580
static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
A
antirez 已提交
581 582 583
static void initClientMultiState(redisClient *c);
static void freeClientMultiState(redisClient *c);
static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
584
static void unblockClientWaitingData(redisClient *c);
A
antirez 已提交
585
static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
586
static void vmInit(void);
A
antirez 已提交
587
static void vmMarkPagesFree(off_t page, off_t count);
A
antirez 已提交
588
static robj *vmLoadObject(robj *key);
A
antirez 已提交
589
static robj *vmPreviewObject(robj *key);
590 591
static int vmSwapOneObjectBlocking(void);
static int vmSwapOneObjectThreaded(void);
A
antirez 已提交
592
static int vmCanSwapOut(void);
593
static int tryFreeOneObjectFromFreelist(void);
594 595 596
static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
static void vmCancelThreadedIOJob(robj *o);
597 598 599 600 601
static void lockThreadedIO(void);
static void unlockThreadedIO(void);
static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
static void freeIOJob(iojob *j);
static void queueIOJob(iojob *j);
602 603
static int vmWriteObjectOnSwap(robj *o, off_t page);
static robj *vmReadObjectFromSwap(off_t page, int type);
604 605
static void waitEmptyIOJobsQueue(void);
static void vmReopenSwapFile(void);
606
static int vmFreePage(off_t page);
607
static void zunionInterBlockClientOnSwappedKeys(redisClient *c);
A
antirez 已提交
608 609 610 611 612 613 614
static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c);
static int dontWaitForSwappedKey(redisClient *c, robj *key);
static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
static struct redisCommand *lookupCommand(char *name);
static void call(redisClient *c, struct redisCommand *cmd);
static void resetClient(redisClient *c);
615
static void convertToRealHash(robj *o);
A
antirez 已提交
616 617 618 619 620
static int pubsubUnsubscribeAllChannels(redisClient *c, int notify);
static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify);
static void freePubsubPattern(void *p);
static int listMatchPubsubPattern(void *a, void *b);
static int compareStringObjects(robj *a, robj *b);
A
antirez 已提交
621
static void usage();
A
antirez 已提交
622

B
Brian Hammond 已提交
623
static void authCommand(redisClient *c);
A
antirez 已提交
624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641
static void pingCommand(redisClient *c);
static void echoCommand(redisClient *c);
static void setCommand(redisClient *c);
static void setnxCommand(redisClient *c);
static void getCommand(redisClient *c);
static void delCommand(redisClient *c);
static void existsCommand(redisClient *c);
static void incrCommand(redisClient *c);
static void decrCommand(redisClient *c);
static void incrbyCommand(redisClient *c);
static void decrbyCommand(redisClient *c);
static void selectCommand(redisClient *c);
static void randomkeyCommand(redisClient *c);
static void keysCommand(redisClient *c);
static void dbsizeCommand(redisClient *c);
static void lastsaveCommand(redisClient *c);
static void saveCommand(redisClient *c);
static void bgsaveCommand(redisClient *c);
642
static void bgrewriteaofCommand(redisClient *c);
A
antirez 已提交
643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658
static void shutdownCommand(redisClient *c);
static void moveCommand(redisClient *c);
static void renameCommand(redisClient *c);
static void renamenxCommand(redisClient *c);
static void lpushCommand(redisClient *c);
static void rpushCommand(redisClient *c);
static void lpopCommand(redisClient *c);
static void rpopCommand(redisClient *c);
static void llenCommand(redisClient *c);
static void lindexCommand(redisClient *c);
static void lrangeCommand(redisClient *c);
static void ltrimCommand(redisClient *c);
static void typeCommand(redisClient *c);
static void lsetCommand(redisClient *c);
static void saddCommand(redisClient *c);
static void sremCommand(redisClient *c);
A
antirez 已提交
659
static void smoveCommand(redisClient *c);
A
antirez 已提交
660 661
static void sismemberCommand(redisClient *c);
static void scardCommand(redisClient *c);
662
static void spopCommand(redisClient *c);
A
antirez 已提交
663
static void srandmemberCommand(redisClient *c);
A
antirez 已提交
664 665
static void sinterCommand(redisClient *c);
static void sinterstoreCommand(redisClient *c);
666 667
static void sunionCommand(redisClient *c);
static void sunionstoreCommand(redisClient *c);
668 669
static void sdiffCommand(redisClient *c);
static void sdiffstoreCommand(redisClient *c);
A
antirez 已提交
670 671 672 673 674
static void syncCommand(redisClient *c);
static void flushdbCommand(redisClient *c);
static void flushallCommand(redisClient *c);
static void sortCommand(redisClient *c);
static void lremCommand(redisClient *c);
A
antirez 已提交
675
static void rpoplpushcommand(redisClient *c);
A
antirez 已提交
676
static void infoCommand(redisClient *c);
677
static void mgetCommand(redisClient *c);
A
antirez 已提交
678
static void monitorCommand(redisClient *c);
A
antirez 已提交
679
static void expireCommand(redisClient *c);
680
static void expireatCommand(redisClient *c);
A
antirez 已提交
681
static void getsetCommand(redisClient *c);
A
antirez 已提交
682
static void ttlCommand(redisClient *c);
683
static void slaveofCommand(redisClient *c);
684
static void debugCommand(redisClient *c);
A
antirez 已提交
685 686
static void msetCommand(redisClient *c);
static void msetnxCommand(redisClient *c);
687
static void zaddCommand(redisClient *c);
A
antirez 已提交
688
static void zincrbyCommand(redisClient *c);
689
static void zrangeCommand(redisClient *c);
690
static void zrangebyscoreCommand(redisClient *c);
691
static void zcountCommand(redisClient *c);
692
static void zrevrangeCommand(redisClient *c);
693
static void zcardCommand(redisClient *c);
A
antirez 已提交
694
static void zremCommand(redisClient *c);
A
antirez 已提交
695
static void zscoreCommand(redisClient *c);
696
static void zremrangebyscoreCommand(redisClient *c);
A
antirez 已提交
697 698
static void multiCommand(redisClient *c);
static void execCommand(redisClient *c);
699
static void discardCommand(redisClient *c);
A
antirez 已提交
700 701
static void blpopCommand(redisClient *c);
static void brpopCommand(redisClient *c);
A
antirez 已提交
702
static void appendCommand(redisClient *c);
A
antirez 已提交
703
static void substrCommand(redisClient *c);
704
static void zrankCommand(redisClient *c);
P
Pieter Noordhuis 已提交
705
static void zrevrankCommand(redisClient *c);
706 707
static void hsetCommand(redisClient *c);
static void hgetCommand(redisClient *c);
P
Pieter Noordhuis 已提交
708 709
static void hmsetCommand(redisClient *c);
static void hmgetCommand(redisClient *c);
710
static void hdelCommand(redisClient *c);
711
static void hlenCommand(redisClient *c);
P
Pieter Noordhuis 已提交
712
static void zremrangebyrankCommand(redisClient *c);
713 714
static void zunionCommand(redisClient *c);
static void zinterCommand(redisClient *c);
A
antirez 已提交
715 716 717
static void hkeysCommand(redisClient *c);
static void hvalsCommand(redisClient *c);
static void hgetallCommand(redisClient *c);
A
antirez 已提交
718
static void hexistsCommand(redisClient *c);
719
static void configCommand(redisClient *c);
720
static void hincrbyCommand(redisClient *c);
A
antirez 已提交
721 722
static void subscribeCommand(redisClient *c);
static void unsubscribeCommand(redisClient *c);
A
antirez 已提交
723 724
static void psubscribeCommand(redisClient *c);
static void punsubscribeCommand(redisClient *c);
A
antirez 已提交
725
static void publishCommand(redisClient *c);
A
antirez 已提交
726

A
antirez 已提交
727 728 729 730 731
/*================================= Globals ================================= */

/* Global vars */
static struct redisServer server; /* server global state */
static struct redisCommand cmdTable[] = {
732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784
    {"get",getCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
    {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
    {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"substr",substrCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"del",delCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"exists",existsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"mget",mgetCommand,-2,REDIS_CMD_INLINE,NULL,1,-1,1},
    {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"rpop",rpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"lpop",lpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
    {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
    {"llen",llenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"lindex",lindexCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
    {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"lrem",lremCommand,4,REDIS_CMD_BULK,NULL,1,1,1},
    {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,2,1},
    {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"srem",sremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"smove",smoveCommand,4,REDIS_CMD_BULK,NULL,1,2,1},
    {"sismember",sismemberCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"scard",scardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"spop",spopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
    {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
    {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
    {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
    {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
    {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
    {"smembers",sinterCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"zrem",zremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zremrangebyrank",zremrangebyrankCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zunion",zunionCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
    {"zinter",zinterCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
    {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zcount",zcountCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zcard",zcardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"zrank",zrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"zrevrank",zrevrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
P
Pieter Noordhuis 已提交
785
    {"hget",hgetCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
P
Pieter Noordhuis 已提交
786
    {"hmset",hmsetCommand,-4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
P
Pieter Noordhuis 已提交
787
    {"hmget",hmgetCommand,-3,REDIS_CMD_BULK,NULL,1,1,1},
788
    {"hincrby",hincrbyCommand,4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
789 790 791 792 793
    {"hdel",hdelCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"hlen",hlenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"hkeys",hkeysCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"hvals",hvalsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"hgetall",hgetallCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
794
    {"hexists",hexistsCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818
    {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
    {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
    {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"select",selectCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"move",moveCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
    {"rename",renameCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
    {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
    {"expire",expireCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
    {"expireat",expireatCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
    {"keys",keysCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"auth",authCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"ping",pingCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"echo",echoCommand,2,REDIS_CMD_BULK,NULL,0,0,0},
    {"save",saveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"type",typeCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"multi",multiCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
A
antirez 已提交
819
    {"exec",execCommand,1,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,0,0,0},
820 821 822 823 824 825 826 827 828 829
    {"discard",discardCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"sync",syncCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"flushall",flushallCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"info",infoCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"monitor",monitorCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"ttl",ttlCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
    {"debug",debugCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
830
    {"config",configCommand,-2,REDIS_CMD_BULK,NULL,0,0,0},
A
antirez 已提交
831 832
    {"subscribe",subscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"unsubscribe",unsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
A
antirez 已提交
833 834
    {"psubscribe",psubscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"punsubscribe",punsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
835
    {"publish",publishCommand,3,REDIS_CMD_BULK|REDIS_CMD_FORCE_REPLICATION,NULL,0,0,0},
836
    {NULL,NULL,0,0,NULL,0,0,0}
A
antirez 已提交
837
};
838

A
antirez 已提交
839 840 841
/*============================ Utility functions ============================ */

/* Glob-style pattern matching.
 *
 * Matches the first stringLen bytes of 'string' against the first patternLen
 * bytes of 'pattern'. Supported syntax:
 *   *        any sequence of characters (including none)
 *   ?        exactly one character
 *   [abc]    one character out of a set, [^abc] negates the set,
 *            [a-z] is a range, '\' quotes the next character of the set
 *   \x       the character x taken literally
 * When 'nocase' is non-zero literals, sets and ranges are compared after
 * tolower(); a '\'-quoted character inside a set is always compared exactly.
 *
 * Returns 1 on match, 0 otherwise.
 *
 * Neither buffer needs to be NUL terminated: every read is bounded by the
 * corresponding length. */
static int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while(patternLen) {
        switch(pattern[0]) {
        case '*':
            /* Collapse a run of '*' without looking past the pattern end. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* match */
            /* Try the rest of the pattern at every remaining position. */
            while(stringLen) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
            break;
        case '?':
            if (stringLen == 0)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        case '[':
        {
            int not, match;

            /* A set always consumes one character: nothing to consume
             * means no match (and string[0] must not be read). */
            if (stringLen == 0)
                return 0; /* no match */
            pattern++;
            patternLen--;
            not = patternLen > 0 && pattern[0] == '^';
            if (not) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while(1) {
                if (patternLen == 0) {
                    /* Unterminated set: step back so that the common
                     * pattern++ after the switch lands on the end. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (pattern[0] == '\\' && patternLen >= 2) {
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = pattern[0];
                    int end = pattern[2];
                    int c = string[0];
                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        start = tolower(start);
                        end = tolower(end);
                        c = tolower(c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((int)pattern[0]) == tolower((int)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (not)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            /* A literal needs a character to compare against. */
            if (stringLen == 0)
                return 0; /* no match */
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((int)pattern[0]) != tolower((int)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* The string is exhausted: only trailing '*' may remain. */
            while(patternLen && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}

964 965 966 967
/* Convenience wrapper around stringmatchlen() for NUL terminated inputs:
 * both lengths are obtained with strlen(). */
static int stringmatch(const char *pattern, const char *string, int nocase) {
    int plen = strlen(pattern);
    int slen = strlen(string);

    return stringmatchlen(pattern,plen,string,slen,nocase);
}

968
/* Write one printf-style formatted line to the server log.
 *
 * Messages whose 'level' is below server.verbosity are discarded before the
 * log file is touched. Otherwise the line is written to stdout when
 * server.logfile is NULL, or appended to that file (opened and closed on
 * every call). Each line is prefixed with the pid, a "%d %b %H:%M:%S"
 * timestamp and one character taken from ".-*#" at index 'level', so
 * 'level' must be in the range 0..3. A trailing newline is added here:
 * callers should not include one in 'fmt'. Failure to open the log file is
 * silently ignored. */
static void redisLog(int level, const char *fmt, ...) {
    const char *c = ".-*#";
    char buf[64];
    va_list ap;
    time_t now;
    FILE *fp;

    /* Filter first, so that suppressed messages cost no fopen()/fclose(). */
    if (level < server.verbosity) return;

    fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
    if (!fp) return;

    now = time(NULL);
    strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
    fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);

    va_start(ap, fmt);
    vfprintf(fp, fmt, ap);
    va_end(ap);

    fprintf(fp,"\n");
    fflush(fp);

    if (server.logfile) fclose(fp);
}

/*====================== Hash table type implementation  ==================== */

/* This is a hash table type that uses the SDS dynamic strings library as
 * keys and redis objects as values (objects can hold SDS strings,
 * lists, sets). */

999 1000 1001 1002 1003 1004
/* Dict destructor callback that releases 'val' with zfree().
 * 'privdata' is not used. */
static void dictVanillaFree(void *privdata, void *val)
{
    DICT_NOTUSED(privdata);
    zfree(val);
}

A
antirez 已提交
1005 1006 1007 1008 1009 1010
/* Dict destructor callback for values that are lists: 'val' is released
 * with listRelease(). 'privdata' is not used. */
static void dictListDestructor(void *privdata, void *val)
{
    DICT_NOTUSED(privdata);
    listRelease((list*)val);
}

A
antirez 已提交
1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026
/* Dict key comparison for SDS strings: two keys are equal when they have
 * the same length and the same bytes. Returns 1 if equal, 0 otherwise.
 * 'privdata' is not used. */
static int sdsDictKeyCompare(void *privdata, const void *key1,
        const void *key2)
{
    int len1 = sdslen((sds)key1);
    int len2 = sdslen((sds)key2);
    DICT_NOTUSED(privdata);

    if (len1 == len2 && memcmp(key1, key2, len1) == 0) return 1;
    return 0;
}

/* Dict destructor callback for redis objects: drops one reference from
 * 'val'. NULL is accepted and ignored, since the values of swapped out
 * keys are set to NULL. 'privdata' is not used. */
static void dictRedisObjectDestructor(void *privdata, void *val)
{
    DICT_NOTUSED(privdata);

    if (val != NULL) decrRefCount(val);
}

1031
static int dictObjKeyCompare(void *privdata, const void *key1,
A
antirez 已提交
1032 1033 1034 1035 1036 1037
        const void *key2)
{
    const robj *o1 = key1, *o2 = key2;
    return sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
}

1038
static unsigned int dictObjHash(const void *key) {
A
antirez 已提交
1039 1040 1041 1042
    const robj *o = key;
    return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
}

1043 1044 1045
static int dictEncObjKeyCompare(void *privdata, const void *key1,
        const void *key2)
{
1046 1047
    robj *o1 = (robj*) key1, *o2 = (robj*) key2;
    int cmp;
1048

1049 1050
    if (o1->encoding == REDIS_ENCODING_INT &&
        o2->encoding == REDIS_ENCODING_INT &&
A
antirez 已提交
1051
        o1->ptr == o2->ptr) return 1;
1052

1053 1054 1055 1056 1057 1058
    o1 = getDecodedObject(o1);
    o2 = getDecodedObject(o2);
    cmp = sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
    decrRefCount(o1);
    decrRefCount(o2);
    return cmp;
1059 1060 1061
}

static unsigned int dictEncObjHash(const void *key) {
1062
    robj *o = (robj*) key;
1063

1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081
    if (o->encoding == REDIS_ENCODING_RAW) {
        return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
    } else {
        if (o->encoding == REDIS_ENCODING_INT) {
            char buf[32];
            int len;

            len = snprintf(buf,32,"%ld",(long)o->ptr);
            return dictGenHashFunction((unsigned char*)buf, len);
        } else {
            unsigned int hash;

            o = getDecodedObject(o);
            hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
            decrRefCount(o);
            return hash;
        }
    }
1082 1083
}

A
antirez 已提交
1084
/* Sets type and expires */
A
antirez 已提交
1085
static dictType setDictType = {
1086
    dictEncObjHash,            /* hash function */
A
antirez 已提交
1087 1088
    NULL,                      /* key dup */
    NULL,                      /* val dup */
1089
    dictEncObjKeyCompare,      /* key compare */
A
antirez 已提交
1090 1091 1092 1093
    dictRedisObjectDestructor, /* key destructor */
    NULL                       /* val destructor */
};

A
antirez 已提交
1094
/* Sorted sets hash (note: a skiplist is used in addition to the hash table) */
1095 1096 1097 1098 1099 1100
static dictType zsetDictType = {
    dictEncObjHash,            /* hash function */
    NULL,                      /* key dup */
    NULL,                      /* val dup */
    dictEncObjKeyCompare,      /* key compare */
    dictRedisObjectDestructor, /* key destructor */
1101
    dictVanillaFree            /* val destructor of malloc(sizeof(double)) */
1102 1103
};

A
antirez 已提交
1104
/* Db->dict */
1105
static dictType dbDictType = {
1106
    dictObjHash,                /* hash function */
A
antirez 已提交
1107 1108
    NULL,                       /* key dup */
    NULL,                       /* val dup */
1109
    dictObjKeyCompare,          /* key compare */
A
antirez 已提交
1110 1111 1112 1113
    dictRedisObjectDestructor,  /* key destructor */
    dictRedisObjectDestructor   /* val destructor */
};

A
antirez 已提交
1114 1115 1116 1117 1118 1119 1120 1121 1122 1123
/* Db->expires: keys are redis objects hashed and compared on their SDS
 * string and released with dictRedisObjectDestructor; values have no
 * destructor. */
static dictType keyptrDictType = {
    dictObjHash,               /* hash function */
    NULL,                      /* key dup */
    NULL,                      /* val dup */
    dictObjKeyCompare,         /* key compare */
    dictRedisObjectDestructor, /* key destructor */
    NULL                       /* val destructor */
};

1124 1125 1126 1127 1128 1129 1130 1131 1132 1133
/* Hash type hash table (note that small hashes are represented with
 * zipmaps). Keys and values are both redis objects: keys go through the
 * encoding-aware hash/compare callbacks, and both keys and values are
 * released with dictRedisObjectDestructor. */
static dictType hashDictType = {
    dictEncObjHash,             /* hash function */
    NULL,                       /* key dup */
    NULL,                       /* val dup */
    dictEncObjKeyCompare,       /* key compare */
    dictRedisObjectDestructor,  /* key destructor */
    dictRedisObjectDestructor   /* val destructor */
};

A
antirez 已提交
1134
/* Keylist hash table type has unencoded redis objects as keys and
A
antirez 已提交
1135 1136
 * lists as values. It's used for blocking operations (BLPOP) and to
 * map swapped keys to a list of clients waiting for this keys to be loaded. */
A
antirez 已提交
1137 1138 1139 1140 1141 1142 1143 1144 1145
static dictType keylistDictType = {
    dictObjHash,                /* hash function */
    NULL,                       /* key dup */
    NULL,                       /* val dup */
    dictObjKeyCompare,          /* key compare */
    dictRedisObjectDestructor,  /* key destructor */
    dictListDestructor          /* val destructor */
};

1146 1147
static void version();

A
antirez 已提交
1148 1149 1150 1151 1152 1153 1154 1155
/* ========================= Random utility functions ======================= */

/* Redis generally does not try to recover from out of memory conditions
 * when allocating objects or strings, it is not clear if it will be possible
 * to report this condition to the client since the networking layer itself
 * is based on heap allocation for send buffers, so we simply abort.
 * At least the code will be simpler to read... */
static void oom(const char *msg) {
1156
    redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
A
antirez 已提交
1157 1158 1159 1160 1161
    sleep(1);
    abort();
}

/* ====================== Redis server networking stuff ===================== */
1162
static void closeTimedoutClients(void) {
A
antirez 已提交
1163 1164 1165
    redisClient *c;
    listNode *ln;
    time_t now = time(NULL);
A
antirez 已提交
1166
    listIter li;
A
antirez 已提交
1167

A
antirez 已提交
1168 1169
    listRewind(server.clients,&li);
    while ((ln = listNext(&li)) != NULL) {
A
antirez 已提交
1170
        c = listNodeValue(ln);
A
antirez 已提交
1171 1172
        if (server.maxidletime &&
            !(c->flags & REDIS_SLAVE) &&    /* no timeout for slaves */
1173
            !(c->flags & REDIS_MASTER) &&   /* no timeout for masters */
A
antirez 已提交
1174 1175
            dictSize(c->pubsub_channels) == 0 && /* no timeout for pubsub */
            listLength(c->pubsub_patterns) == 0 &&
1176
            (now - c->lastinteraction > server.maxidletime))
A
antirez 已提交
1177
        {
1178
            redisLog(REDIS_VERBOSE,"Closing idle client");
A
antirez 已提交
1179
            freeClient(c);
A
antirez 已提交
1180
        } else if (c->flags & REDIS_BLOCKED) {
1181
            if (c->blockingto != 0 && c->blockingto < now) {
1182
                addReply(c,shared.nullmultibulk);
1183
                unblockClientWaitingData(c);
A
antirez 已提交
1184
            }
A
antirez 已提交
1185 1186 1187 1188
        }
    }
}

1189 1190 1191 1192 1193 1194 1195 1196 1197
/* Return 1 when the hash table is worth shrinking: it has slots and
 * elements, it has more than DICT_HT_INITIAL_SIZE slots, and its fill
 * percentage (used*100/slots) is below REDIS_HT_MINFILL. Return 0
 * otherwise. */
static int htNeedsResize(dict *dict) {
    long long slots = dictSlots(dict);
    long long used = dictSize(dict);

    if (!slots || !used) return 0;
    if (slots <= DICT_HT_INITIAL_SIZE) return 0;
    return used*100/slots < REDIS_HT_MINFILL;
}

1198 1199
/* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
 * we resize the hash table to save memory */
1200
static void tryResizeHashTables(void) {
1201 1202 1203
    int j;

    for (j = 0; j < server.dbnum; j++) {
1204
        if (htNeedsResize(server.db[j].dict)) {
1205
            redisLog(REDIS_VERBOSE,"The hash table %d is too sparse, resize it...",j);
1206
            dictResize(server.db[j].dict);
1207
            redisLog(REDIS_VERBOSE,"Hash table %d resized.",j);
1208
        }
1209 1210
        if (htNeedsResize(server.db[j].expires))
            dictResize(server.db[j].expires);
1211 1212 1213
    }
}

1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227
/* A background saving child (BGSAVE) terminated its work. Handle this.
 *
 * 'statloc' is the wait status of the child. On a clean exit with status 0
 * the dirty counter is reset and the last save time updated; a non-zero
 * exit is logged; termination by signal is logged and the temp file of the
 * child is removed. In every case the child pid is cleared and the slaves
 * waiting for this BGSAVE are notified with REDIS_OK only if the save
 * really succeeded. */
void backgroundSaveDoneHandler(int statloc) {
    int exitcode = WEXITSTATUS(statloc);
    int bysignal = WIFSIGNALED(statloc);
    /* WEXITSTATUS() is only meaningful for a child that exited normally
     * (and typically evaluates to 0 for a signaled one), so success must
     * never be derived from 'exitcode' alone. */
    int success = (!bysignal && exitcode == 0);

    if (success) {
        redisLog(REDIS_NOTICE,
            "Background saving terminated with success");
        server.dirty = 0;
        server.lastsave = time(NULL);
    } else if (!bysignal) {
        redisLog(REDIS_WARNING, "Background saving error");
    } else {
        redisLog(REDIS_WARNING,
            "Background saving terminated by signal %d", WTERMSIG(statloc));
        rdbRemoveTempFile(server.bgsavechildpid);
    }
    server.bgsavechildpid = -1;
    /* Possibly there are slaves waiting for a BGSAVE in order to be served
     * (the first stage of SYNC is a bulk transfer of dump.rdb) */
    updateSlavesWaitingBgsave(success ? REDIS_OK : REDIS_ERR);
}

/* A background append only file rewriting (BGREWRITEAOF) terminated its work.
 * Handle this.
 *
 * 'statloc' is the wait status of the child. When the child exited with
 * status 0, the data accumulated by the parent in server.bgrewritebuf is
 * appended to the temp file written by the child, the temp file is renamed
 * to server.appendfilename and, if append only is enabled
 * (server.appendfd != -1), the server switches to the new file descriptor.
 * Any failure along the way is logged and the rewrite is abandoned.
 * In every case the rewrite buffer is reset, the temp file of the child is
 * removed (if still there) and the child pid is cleared. */
void backgroundRewriteDoneHandler(int statloc) {
    int exitcode = WEXITSTATUS(statloc);
    int bysignal = WIFSIGNALED(statloc);

    if (!bysignal && exitcode == 0) {
        int fd;
        char tmpfile[256];

        redisLog(REDIS_NOTICE,
            "Background append only file rewriting terminated with success");
        /* Now it's time to flush the differences accumulated by the parent */
        snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
        fd = open(tmpfile,O_WRONLY|O_APPEND);
        if (fd == -1) {
            redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
            goto cleanup;
        }
        /* Flush our data... */
        if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
                (signed) sdslen(server.bgrewritebuf)) {
            redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
            close(fd);
            goto cleanup;
        }
        /* sdslen() returns a size_t: cast it so that the argument really
         * matches the %lu conversion on every platform. */
        redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",(unsigned long) sdslen(server.bgrewritebuf));
        /* Now our work is to rename the temp file into the stable file. And
         * switch the file descriptor used by the server for append only. */
        if (rename(tmpfile,server.appendfilename) == -1) {
            redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
            close(fd);
            goto cleanup;
        }
        /* Mission completed... almost */
        redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
        if (server.appendfd != -1) {
            /* If append only is actually enabled... */
            close(server.appendfd);
            server.appendfd = fd;
            fsync(fd);
            server.appendseldb = -1; /* Make sure it will issue SELECT */
            redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
        } else {
            /* If append only is disabled we just generate a dump in this
             * format. Why not? */
            close(fd);
        }
    } else if (!bysignal && exitcode != 0) {
        redisLog(REDIS_WARNING, "Background append only file rewriting error");
    } else {
        redisLog(REDIS_WARNING,
            "Background append only file rewriting terminated by signal %d",
            WTERMSIG(statloc));
    }
cleanup:
    /* Common exit path, taken on success as well as on failure. */
    sdsfree(server.bgrewritebuf);
    server.bgrewritebuf = sdsempty();
    aofRemoveTempFile(server.bgrewritechildpid);
    server.bgrewritechildpid = -1;
}

1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311
/* This function is called once a background process of some kind
 * terminates. We want to avoid resizing the hash tables while there is a
 * child, in order to play well with copy-on-write (otherwise when a resize
 * happens lots of memory pages are copied). The function updates the
 * ability of dict.c to resize the hash tables according to whether or not
 * there is a running child: resizing is disabled while a BGSAVE or
 * BGREWRITEAOF child pid is set, and enabled otherwise. */
static void updateDictResizePolicy(void) {
    int child_running = (server.bgsavechildpid != -1 ||
                         server.bgrewritechildpid != -1);

    if (child_running)
        dictDisableResize();
    else
        dictEnableResize();
}

1312
static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
1313
    int j, loops = server.cronloops++;
A
antirez 已提交
1314 1315 1316 1317
    REDIS_NOTUSED(eventLoop);
    REDIS_NOTUSED(id);
    REDIS_NOTUSED(clientData);

A
antirez 已提交
1318 1319 1320 1321 1322 1323
    /* We take a cached value of the unix time in the global state because
     * with virtual memory and aging there is to store the current time
     * in objects at every object access, and accuracy is not needed.
     * To access a global var is faster than calling time(NULL) */
    server.unixtime = time(NULL);

1324
    /* Show some info about non-empty databases */
A
antirez 已提交
1325
    for (j = 0; j < server.dbnum; j++) {
1326
        long long size, used, vkeys;
1327

A
antirez 已提交
1328 1329
        size = dictSlots(server.db[j].dict);
        used = dictSize(server.db[j].dict);
1330
        vkeys = dictSize(server.db[j].expires);
1331
        if (!(loops % 50) && (used || vkeys)) {
1332
            redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
A
antirez 已提交
1333
            /* dictPrintStats(server.dict); */
A
antirez 已提交
1334 1335 1336
        }
    }

1337 1338 1339 1340 1341 1342
    /* We don't want to resize the hash tables while a bacground saving
     * is in progress: the saving child is created using fork() that is
     * implemented with a copy-on-write semantic in most modern systems, so
     * if we resize the HT while there is the saving child at work actually
     * a lot of memory movements in the parent will cause a lot of pages
     * copied. */
1343 1344 1345 1346 1347
    if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1 &&
        !(loops % 10))
    {
        tryResizeHashTables();
    }
1348

A
antirez 已提交
1349
    /* Show information about connected clients */
1350
    if (!(loops % 50)) {
1351
        redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use",
A
antirez 已提交
1352 1353
            listLength(server.clients)-listLength(server.slaves),
            listLength(server.slaves),
1354
            zmalloc_used_memory());
A
antirez 已提交
1355 1356 1357
    }

    /* Close connections of timedout clients */
1358
    if ((server.maxidletime && !(loops % 100)) || server.blpop_blocked_clients)
A
antirez 已提交
1359 1360
        closeTimedoutClients();

1361 1362
    /* Check if a background saving or AOF rewrite in progress terminated */
    if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
A
antirez 已提交
1363
        int statloc;
1364 1365 1366 1367 1368
        pid_t pid;

        if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
            if (pid == server.bgsavechildpid) {
                backgroundSaveDoneHandler(statloc);
A
antirez 已提交
1369
            } else {
1370
                backgroundRewriteDoneHandler(statloc);
A
antirez 已提交
1371
            }
1372
            updateDictResizePolicy();
A
antirez 已提交
1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384
        }
    } else {
        /* If there is not a background saving in progress check if
         * we have to save now */
         time_t now = time(NULL);
         for (j = 0; j < server.saveparamslen; j++) {
            struct saveparam *sp = server.saveparams+j;

            if (server.dirty >= sp->changes &&
                now-server.lastsave > sp->seconds) {
                redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
                    sp->changes, sp->seconds);
1385
                rdbSaveBackground(server.dbfilename);
A
antirez 已提交
1386 1387 1388 1389
                break;
            }
         }
    }
1390

1391 1392 1393 1394
    /* Try to expire a few timed out keys. The algorithm used is adaptive and
     * will use few CPU cycles if there are few expiring keys, otherwise
     * it will get more aggressive to avoid that too much memory is used by
     * keys that can be removed from the keyspace. */
1395
    for (j = 0; j < server.dbnum; j++) {
1396
        int expired;
1397 1398
        redisDb *db = server.db+j;

1399 1400 1401
        /* Continue to expire if at the end of the cycle more than 25%
         * of the keys were expired. */
        do {
A
antirez 已提交
1402
            long num = dictSize(db->expires);
1403 1404
            time_t now = time(NULL);

1405
            expired = 0;
1406 1407 1408 1409 1410 1411 1412 1413 1414 1415
            if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
                num = REDIS_EXPIRELOOKUPS_PER_CRON;
            while (num--) {
                dictEntry *de;
                time_t t;

                if ((de = dictGetRandomKey(db->expires)) == NULL) break;
                t = (time_t) dictGetEntryVal(de);
                if (now > t) {
                    deleteKey(db,dictGetEntryKey(de));
1416
                    expired++;
A
antirez 已提交
1417
                    server.stat_expiredkeys++;
1418 1419
                }
            }
1420
        } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
1421 1422
    }

A
antirez 已提交
1423
    /* Swap a few keys on disk if we are over the memory limit and VM
1424
     * is enbled. Try to free objects from the free list first. */
A
antirez 已提交
1425 1426
    if (vmCanSwapOut()) {
        while (server.vm_enabled && zmalloc_used_memory() >
1427 1428
                server.vm_max_memory)
        {
A
antirez 已提交
1429 1430
            int retval;

1431
            if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
A
antirez 已提交
1432 1433 1434
            retval = (server.vm_max_threads == 0) ?
                        vmSwapOneObjectBlocking() :
                        vmSwapOneObjectThreaded();
1435
            if (retval == REDIS_ERR && !(loops % 300) &&
A
antirez 已提交
1436 1437 1438 1439
                zmalloc_used_memory() >
                (server.vm_max_memory+server.vm_max_memory/10))
            {
                redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
A
antirez 已提交
1440
            }
A
antirez 已提交
1441 1442 1443 1444 1445
            /* Note that when using threade I/O we free just one object,
             * because anyway when the I/O thread in charge to swap this
             * object out will finish, the handler of completed jobs
             * will try to swap more objects if we are still out of memory. */
            if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
A
antirez 已提交
1446 1447 1448
        }
    }

A
antirez 已提交
1449
    /* Check if we should connect to a MASTER */
1450
    if (server.replstate == REDIS_REPL_CONNECT && !(loops % 10)) {
A
antirez 已提交
1451 1452 1453 1454 1455
        redisLog(REDIS_NOTICE,"Connecting to MASTER...");
        if (syncWithMaster() == REDIS_OK) {
            redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
        }
    }
1456
    return 100;
A
antirez 已提交
1457 1458
}

A
antirez 已提交
1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490
/* Hook invoked every time Redis is about to enter the main loop of the
 * event driven library, that is, right before sleeping for ready file
 * descriptors.
 *
 * When VM is enabled, every client queued in server.io_ready_clients is
 * resumed: it is removed from the queue, its REDIS_IO_WAIT flag is cleared,
 * its readable event is installed again, and the command found in its
 * argv[0] is looked up and executed. 'eventLoop' is unused. */
static void beforeSleep(struct aeEventLoop *eventLoop) {
    listIter iter;
    listNode *node;

    REDIS_NOTUSED(eventLoop);

    /* Nothing to do without VM or with no client ready to be resumed. */
    if (!server.vm_enabled || listLength(server.io_ready_clients) == 0)
        return;

    listRewind(server.io_ready_clients,&iter);
    while ((node = listNext(&iter)) != NULL) {
        redisClient *client = node->value;
        struct redisCommand *cmd;

        /* Resume the client. */
        listDelNode(server.io_ready_clients,node);
        client->flags &= (~REDIS_IO_WAIT);
        server.vm_blocked_clients--;
        aeCreateFileEvent(server.el, client->fd, AE_READABLE,
            readQueryFromClient, client);

        /* Run the command the client was waiting to execute. */
        cmd = lookupCommand(client->argv[0]->ptr);
        assert(cmd != NULL);
        call(client,cmd);
        resetClient(client);

        /* There may be more data to process in the input buffer. */
        if (client->querybuf && sdslen(client->querybuf) > 0)
            processInputBuffer(client);
    }
}

A
antirez 已提交
1491
static void createSharedObjects(void) {
1492 1493
    int j;

A
antirez 已提交
1494 1495 1496
    shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
    shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
    shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
1497 1498 1499 1500 1501 1502
    shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
    shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
    shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
    shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
    shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
    shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
A
antirez 已提交
1503
    shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
A
antirez 已提交
1504
    shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
A
antirez 已提交
1505 1506 1507 1508 1509 1510
    shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
        "-ERR Operation against a key holding the wrong kind of value\r\n"));
    shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
        "-ERR no such key\r\n"));
    shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
        "-ERR syntax error\r\n"));
1511 1512 1513 1514
    shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
        "-ERR source and destination objects are the same\r\n"));
    shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
        "-ERR index out of range\r\n"));
A
antirez 已提交
1515
    shared.space = createObject(REDIS_STRING,sdsnew(" "));
1516 1517
    shared.colon = createObject(REDIS_STRING,sdsnew(":"));
    shared.plus = createObject(REDIS_STRING,sdsnew("+"));
A
antirez 已提交
1518 1519 1520 1521 1522 1523 1524 1525 1526 1527
    shared.select0 = createStringObject("select 0\r\n",10);
    shared.select1 = createStringObject("select 1\r\n",10);
    shared.select2 = createStringObject("select 2\r\n",10);
    shared.select3 = createStringObject("select 3\r\n",10);
    shared.select4 = createStringObject("select 4\r\n",10);
    shared.select5 = createStringObject("select 5\r\n",10);
    shared.select6 = createStringObject("select 6\r\n",10);
    shared.select7 = createStringObject("select 7\r\n",10);
    shared.select8 = createStringObject("select 8\r\n",10);
    shared.select9 = createStringObject("select 9\r\n",10);
A
antirez 已提交
1528 1529
    shared.messagebulk = createStringObject("$7\r\nmessage\r\n",13);
    shared.subscribebulk = createStringObject("$9\r\nsubscribe\r\n",15);
A
antirez 已提交
1530
    shared.unsubscribebulk = createStringObject("$11\r\nunsubscribe\r\n",18);
A
antirez 已提交
1531 1532
    shared.psubscribebulk = createStringObject("$10\r\npsubscribe\r\n",17);
    shared.punsubscribebulk = createStringObject("$12\r\npunsubscribe\r\n",19);
A
antirez 已提交
1533
    shared.mbulk3 = createStringObject("*3\r\n",4);
1534 1535 1536 1537
    for (j = 0; j < REDIS_SHARED_INTEGERS; j++) {
        shared.integers[j] = createObject(REDIS_STRING,(void*)(long)j);
        shared.integers[j]->encoding = REDIS_ENCODING_INT;
    }
A
antirez 已提交
1538 1539 1540 1541 1542 1543 1544 1545 1546
}

/* Append a new save point to server.saveparams: the array is grown by one
 * element through zrealloc(), the new slot is filled with 'seconds' and
 * 'changes', and server.saveparamslen is incremented. */
static void appendServerSaveParams(time_t seconds, int changes) {
    int slot = server.saveparamslen;

    server.saveparams = zrealloc(server.saveparams,
        sizeof(struct saveparam)*(slot+1));
    server.saveparams[slot].seconds = seconds;
    server.saveparams[slot].changes = changes;
    server.saveparamslen = slot+1;
}

1547
/* Drop every configured save point: release the server.saveparams array
 * and reset it to an empty (NULL, zero length) state. */
static void resetServerSaveParams() {
    zfree(server.saveparams);
    server.saveparams = NULL;
    server.saveparamslen = 0;
}

static void initServerConfig() {
    server.dbnum = REDIS_DEFAULT_DBNUM;
    server.port = REDIS_SERVERPORT;
1556
    server.verbosity = REDIS_VERBOSE;
A
antirez 已提交
1557 1558 1559 1560 1561 1562
    server.maxidletime = REDIS_MAXIDLETIME;
    server.saveparams = NULL;
    server.logfile = NULL; /* NULL = log on standard output */
    server.bindaddr = NULL;
    server.glueoutputbuf = 1;
    server.daemonize = 0;
1563
    server.appendonly = 0;
1564
    server.appendfsync = APPENDFSYNC_ALWAYS;
1565
    server.lastfsync = time(NULL);
1566 1567
    server.appendfd = -1;
    server.appendseldb = -1; /* Make sure the first time will not match */
1568 1569 1570
    server.pidfile = zstrdup("/var/run/redis.pid");
    server.dbfilename = zstrdup("dump.rdb");
    server.appendfilename = zstrdup("appendonly.aof");
B
Brian Hammond 已提交
1571
    server.requirepass = NULL;
1572
    server.shareobjects = 0;
1573
    server.rdbcompression = 1;
1574
    server.maxclients = 0;
A
antirez 已提交
1575
    server.blpop_blocked_clients = 0;
A
antirez 已提交
1576
    server.maxmemory = 0;
1577
    server.vm_enabled = 0;
1578
    server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
1579 1580 1581
    server.vm_page_size = 256;          /* 256 bytes per page */
    server.vm_pages = 1024*1024*100;    /* 104 millions of pages */
    server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
1582
    server.vm_max_threads = 4;
A
antirez 已提交
1583
    server.vm_blocked_clients = 0;
1584 1585
    server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
    server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;
1586

1587
    resetServerSaveParams();
A
antirez 已提交
1588 1589 1590 1591 1592 1593

    appendServerSaveParams(60*60,1);  /* save after 1 hour and 1 change */
    appendServerSaveParams(300,100);  /* save after 5 minutes and 100 changes */
    appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
    /* Replication related */
    server.isslave = 0;
1594
    server.masterauth = NULL;
A
antirez 已提交
1595 1596 1597 1598
    server.masterhost = NULL;
    server.masterport = 6379;
    server.master = NULL;
    server.replstate = REDIS_REPL_NONE;
1599 1600 1601 1602 1603 1604

    /* Double constants initialization */
    R_Zero = 0.0;
    R_PosInf = 1.0/R_Zero;
    R_NegInf = -1.0/R_Zero;
    R_Nan = R_Zero/R_Zero;
A
antirez 已提交
1605 1606 1607 1608 1609 1610 1611
}

static void initServer() {
    int j;

    signal(SIGHUP, SIG_IGN);
    signal(SIGPIPE, SIG_IGN);
H
hrothgar 已提交
1612
    setupSigSegvAction();
A
antirez 已提交
1613

1614 1615 1616 1617 1618
    server.devnull = fopen("/dev/null","w");
    if (server.devnull == NULL) {
        redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
        exit(1);
    }
A
antirez 已提交
1619 1620
    server.clients = listCreate();
    server.slaves = listCreate();
A
antirez 已提交
1621
    server.monitors = listCreate();
A
antirez 已提交
1622 1623 1624
    server.objfreelist = listCreate();
    createSharedObjects();
    server.el = aeCreateEventLoop();
A
antirez 已提交
1625
    server.db = zmalloc(sizeof(redisDb)*server.dbnum);
A
antirez 已提交
1626 1627 1628 1629 1630
    server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
    if (server.fd == -1) {
        redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
        exit(1);
    }
A
antirez 已提交
1631
    for (j = 0; j < server.dbnum; j++) {
1632
        server.db[j].dict = dictCreate(&dbDictType,NULL);
A
antirez 已提交
1633
        server.db[j].expires = dictCreate(&keyptrDictType,NULL);
A
antirez 已提交
1634
        server.db[j].blockingkeys = dictCreate(&keylistDictType,NULL);
A
antirez 已提交
1635 1636
        if (server.vm_enabled)
            server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
A
antirez 已提交
1637 1638
        server.db[j].id = j;
    }
A
antirez 已提交
1639 1640 1641 1642
    server.pubsub_channels = dictCreate(&keylistDictType,NULL);
    server.pubsub_patterns = listCreate();
    listSetFreeMethod(server.pubsub_patterns,freePubsubPattern);
    listSetMatchMethod(server.pubsub_patterns,listMatchPubsubPattern);
A
antirez 已提交
1643
    server.cronloops = 0;
1644
    server.bgsavechildpid = -1;
1645 1646
    server.bgrewritechildpid = -1;
    server.bgrewritebuf = sdsempty();
A
antirez 已提交
1647 1648 1649 1650
    server.lastsave = time(NULL);
    server.dirty = 0;
    server.stat_numcommands = 0;
    server.stat_numconnections = 0;
A
antirez 已提交
1651
    server.stat_expiredkeys = 0;
A
antirez 已提交
1652
    server.stat_starttime = time(NULL);
A
antirez 已提交
1653
    server.unixtime = time(NULL);
1654
    aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
1655 1656
    if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
        acceptHandler, NULL) == AE_ERR) oom("creating file event");
1657 1658

    if (server.appendonly) {
A
antirez 已提交
1659
        server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
1660 1661 1662 1663 1664 1665
        if (server.appendfd == -1) {
            redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
                strerror(errno));
            exit(1);
        }
    }
1666 1667

    if (server.vm_enabled) vmInit();
A
antirez 已提交
1668 1669 1670
}

/* Empty the whole database */
1671
static long long emptyDb() {
A
antirez 已提交
1672
    int j;
1673
    long long removed = 0;
A
antirez 已提交
1674

A
antirez 已提交
1675
    for (j = 0; j < server.dbnum; j++) {
1676
        removed += dictSize(server.db[j].dict);
A
antirez 已提交
1677 1678 1679
        dictEmpty(server.db[j].dict);
        dictEmpty(server.db[j].expires);
    }
1680
    return removed;
A
antirez 已提交
1681 1682
}

A
antirez 已提交
1683 1684 1685 1686 1687 1688
/* Convert a "yes"/"no" string into an integer, ignoring case.
 * Returns 1 for "yes", 0 for "no" and -1 for any other string. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    if (strcasecmp(s,"no") == 0) return 0;
    return -1;
}

A
antirez 已提交
1689 1690 1691
/* I agree, this is a very rudimental way to load a configuration...
   will improve later if the config gets more complex */
static void loadServerConfig(char *filename) {
1692
    FILE *fp;
A
antirez 已提交
1693 1694 1695
    char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
    int linenum = 0;
    sds line = NULL;
1696 1697 1698 1699 1700

    if (filename[0] == '-' && filename[1] == '\0')
        fp = stdin;
    else {
        if ((fp = fopen(filename,"r")) == NULL) {
1701
            redisLog(REDIS_WARNING, "Fatal error, can't open config file '%s'", filename);
1702 1703
            exit(1);
        }
A
antirez 已提交
1704
    }
1705

A
antirez 已提交
1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724
    while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
        sds *argv;
        int argc, j;

        linenum++;
        line = sdsnew(buf);
        line = sdstrim(line," \t\r\n");

        /* Skip comments and blank lines*/
        if (line[0] == '#' || line[0] == '\0') {
            sdsfree(line);
            continue;
        }

        /* Split into arguments */
        argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
        sdstolower(argv[0]);

        /* Execute config directives */
1725
        if (!strcasecmp(argv[0],"timeout") && argc == 2) {
A
antirez 已提交
1726
            server.maxidletime = atoi(argv[1]);
1727
            if (server.maxidletime < 0) {
A
antirez 已提交
1728 1729
                err = "Invalid timeout value"; goto loaderr;
            }
1730
        } else if (!strcasecmp(argv[0],"port") && argc == 2) {
A
antirez 已提交
1731 1732 1733 1734
            server.port = atoi(argv[1]);
            if (server.port < 1 || server.port > 65535) {
                err = "Invalid port"; goto loaderr;
            }
1735
        } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
A
antirez 已提交
1736
            server.bindaddr = zstrdup(argv[1]);
1737
        } else if (!strcasecmp(argv[0],"save") && argc == 3) {
A
antirez 已提交
1738 1739 1740 1741 1742 1743
            int seconds = atoi(argv[1]);
            int changes = atoi(argv[2]);
            if (seconds < 1 || changes < 0) {
                err = "Invalid save parameters"; goto loaderr;
            }
            appendServerSaveParams(seconds,changes);
1744
        } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
A
antirez 已提交
1745 1746 1747 1748 1749
            if (chdir(argv[1]) == -1) {
                redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
                    argv[1], strerror(errno));
                exit(1);
            }
1750 1751
        } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
            if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
1752
            else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
1753 1754
            else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
            else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
A
antirez 已提交
1755 1756 1757 1758
            else {
                err = "Invalid log level. Must be one of debug, notice, warning";
                goto loaderr;
            }
1759
        } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
1760
            FILE *logfp;
A
antirez 已提交
1761 1762

            server.logfile = zstrdup(argv[1]);
1763
            if (!strcasecmp(server.logfile,"stdout")) {
A
antirez 已提交
1764 1765 1766 1767 1768 1769
                zfree(server.logfile);
                server.logfile = NULL;
            }
            if (server.logfile) {
                /* Test if we are able to open the file. The server will not
                 * be able to abort just for this problem later... */
1770 1771
                logfp = fopen(server.logfile,"a");
                if (logfp == NULL) {
A
antirez 已提交
1772 1773 1774 1775
                    err = sdscatprintf(sdsempty(),
                        "Can't open the log file: %s", strerror(errno));
                    goto loaderr;
                }
1776
                fclose(logfp);
A
antirez 已提交
1777
            }
1778
        } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
A
antirez 已提交
1779 1780 1781 1782
            server.dbnum = atoi(argv[1]);
            if (server.dbnum < 1) {
                err = "Invalid number of databases"; goto loaderr;
            }
1783 1784
        } else if (!strcasecmp(argv[0],"include") && argc == 2) {
            loadServerConfig(argv[1]);
1785 1786
        } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
            server.maxclients = atoi(argv[1]);
A
antirez 已提交
1787
        } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
1788
            server.maxmemory = strtoll(argv[1], NULL, 10);
1789
        } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
A
antirez 已提交
1790 1791 1792
            server.masterhost = sdsnew(argv[1]);
            server.masterport = atoi(argv[2]);
            server.replstate = REDIS_REPL_CONNECT;
1793 1794
        } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
        	server.masterauth = zstrdup(argv[1]);
1795
        } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
A
antirez 已提交
1796
            if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
A
antirez 已提交
1797 1798
                err = "argument must be 'yes' or 'no'"; goto loaderr;
            }
1799
        } else if (!strcasecmp(argv[0],"shareobjects") && argc == 2) {
A
antirez 已提交
1800
            if ((server.shareobjects = yesnotoi(argv[1])) == -1) {
1801 1802
                err = "argument must be 'yes' or 'no'"; goto loaderr;
            }
1803 1804 1805 1806
        } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
            if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
                err = "argument must be 'yes' or 'no'"; goto loaderr;
            }
1807
        } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
A
antirez 已提交
1808
            if ((server.daemonize = yesnotoi(argv[1])) == -1) {
A
antirez 已提交
1809 1810
                err = "argument must be 'yes' or 'no'"; goto loaderr;
            }
1811 1812 1813 1814
        } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
            if ((server.appendonly = yesnotoi(argv[1])) == -1) {
                err = "argument must be 'yes' or 'no'"; goto loaderr;
            }
1815
        } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1816
            if (!strcasecmp(argv[1],"no")) {
1817
                server.appendfsync = APPENDFSYNC_NO;
1818
            } else if (!strcasecmp(argv[1],"always")) {
1819
                server.appendfsync = APPENDFSYNC_ALWAYS;
1820
            } else if (!strcasecmp(argv[1],"everysec")) {
1821 1822 1823 1824 1825
                server.appendfsync = APPENDFSYNC_EVERYSEC;
            } else {
                err = "argument must be 'no', 'always' or 'everysec'";
                goto loaderr;
            }
1826
        } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
1827
            server.requirepass = zstrdup(argv[1]);
1828
        } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
1829
            zfree(server.pidfile);
1830
            server.pidfile = zstrdup(argv[1]);
1831
        } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
1832
            zfree(server.dbfilename);
1833
            server.dbfilename = zstrdup(argv[1]);
1834 1835 1836 1837
        } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
            if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
                err = "argument must be 'yes' or 'no'"; goto loaderr;
            }
1838
        } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
1839
            zfree(server.vm_swap_file);
1840
            server.vm_swap_file = zstrdup(argv[1]);
A
antirez 已提交
1841 1842 1843 1844 1845 1846
        } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
            server.vm_max_memory = strtoll(argv[1], NULL, 10);
        } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
            server.vm_page_size = strtoll(argv[1], NULL, 10);
        } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
            server.vm_pages = strtoll(argv[1], NULL, 10);
1847 1848
        } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
            server.vm_max_threads = strtoll(argv[1], NULL, 10);
1849 1850 1851 1852 1853 1854
        } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){
            server.hash_max_zipmap_entries = strtol(argv[1], NULL, 10);
        } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){
            server.hash_max_zipmap_value = strtol(argv[1], NULL, 10);
        } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
            server.vm_max_threads = strtoll(argv[1], NULL, 10);
A
antirez 已提交
1855 1856 1857 1858 1859 1860 1861 1862
        } else {
            err = "Bad directive or wrong number of arguments"; goto loaderr;
        }
        for (j = 0; j < argc; j++)
            sdsfree(argv[j]);
        zfree(argv);
        sdsfree(line);
    }
1863
    if (fp != stdin) fclose(fp);
A
antirez 已提交
1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878
    return;

loaderr:
    fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
    fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
    fprintf(stderr, ">>> '%s'\n", line);
    fprintf(stderr, "%s\n", err);
    exit(1);
}

/* Release the references held by the client's argument vectors: every
 * object in c->argv and c->mbargv gets its reference count decremented and
 * both counters are reset to zero. The arrays themselves are not freed
 * here. */
static void freeClientArgv(redisClient *c) {
    int j;

    for (j = 0; j < c->argc; j++)
        decrRefCount(c->argv[j]);
    for (j = 0; j < c->mbargc; j++)
        decrRefCount(c->mbargv[j]);
    c->argc = 0;
    c->mbargc = 0;
}

/* Destroy a client: unblock it if it is blocked, drop its pubsub
 * subscriptions, remove its file events, close its socket, unlink it from
 * every server-side list it may be part of (clients, io_ready_clients,
 * slaves or monitors), update the replication state if it was the master,
 * and finally release all the memory it owns, including 'c' itself.
 *
 * The order of the steps matters, see the comments in the body. */
static void freeClient(redisClient *c) {
    listNode *ln;

    /* Note that if the client we are freeing is blocked into a blocking
     * call, we have to set querybuf to NULL *before* to call
     * unblockClientWaitingData() to avoid processInputBuffer() will get
     * called. Also it is important to remove the file events after
     * this, because this call adds the READABLE event. */
    sdsfree(c->querybuf);
    c->querybuf = NULL;
    if (c->flags & REDIS_BLOCKED)
        unblockClientWaitingData(c);

    /* Unsubscribe from all the pubsub channels */
    pubsubUnsubscribeAllChannels(c,0);
    pubsubUnsubscribeAllPatterns(c,0);
    dictRelease(c->pubsub_channels);
    listRelease(c->pubsub_patterns);
    /* Obvious cleanup */
    aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
    aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
    listRelease(c->reply);
    freeClientArgv(c);
    close(c->fd);
    /* Remove from the list of clients */
    ln = listSearchKey(server.clients,c);
    redisAssert(ln != NULL);
    listDelNode(server.clients,ln);
    /* Remove from the list of clients waiting for swapped keys */
    if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
        ln = listSearchKey(server.io_ready_clients,c);
        if (ln) {
            listDelNode(server.io_ready_clients,ln);
            server.vm_blocked_clients--;
        }
    }
    /* Stop waiting for every key still listed in c->io_keys. */
    while (server.vm_enabled && listLength(c->io_keys)) {
        ln = listFirst(c->io_keys);
        dontWaitForSwappedKey(c,ln->value);
    }
    listRelease(c->io_keys);
    /* Master/slave cleanup */
    if (c->flags & REDIS_SLAVE) {
        if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
            close(c->repldbfd);
        /* Clients flagged REDIS_MONITOR are looked up in the monitors
         * list instead of the slaves list. */
        list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
        ln = listSearchKey(l,c);
        redisAssert(ln != NULL);
        listDelNode(l,ln);
    }
    if (c->flags & REDIS_MASTER) {
        /* We lost the master: schedule a new connection attempt. */
        server.master = NULL;
        server.replstate = REDIS_REPL_CONNECT;
    }
    /* Release memory */
    zfree(c->argv);
    zfree(c->mbargv);
    freeClientMultiState(c);
    zfree(c);
}

1946
#define GLUEREPLY_UP_TO (1024)
A
antirez 已提交
1947
static void glueReplyBuffersIfNeeded(redisClient *c) {
1948 1949
    int copylen = 0;
    char buf[GLUEREPLY_UP_TO];
1950
    listNode *ln;
A
antirez 已提交
1951
    listIter li;
A
antirez 已提交
1952 1953
    robj *o;

A
antirez 已提交
1954 1955
    listRewind(c->reply,&li);
    while((ln = listNext(&li))) {
1956 1957
        int objlen;

A
antirez 已提交
1958
        o = ln->value;
1959 1960 1961 1962
        objlen = sdslen(o->ptr);
        if (copylen + objlen <= GLUEREPLY_UP_TO) {
            memcpy(buf+copylen,o->ptr,objlen);
            copylen += objlen;
A
antirez 已提交
1963
            listDelNode(c->reply,ln);
1964 1965 1966
        } else {
            if (copylen == 0) return;
            break;
A
antirez 已提交
1967 1968
        }
    }
1969 1970 1971
    /* Now the output buffer is empty, add the new single element */
    o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
    listAddNodeHead(c->reply,o);
A
antirez 已提交
1972 1973 1974 1975 1976 1977 1978 1979 1980
}

/* Writable-event handler: flush as much of the pending reply list of the
 * client (passed as privdata) as the socket accepts.
 *
 * When output gluing is disabled, the client is not a master and the list
 * holds more than REDIS_WRITEV_THRESHOLD objects, the work is delegated to
 * sendReplyToClientWritev(). Otherwise objects are written one at a time,
 * with c->sentlen tracking how many bytes of the head object were already
 * sent. On a write error other than EAGAIN the client is freed. When the
 * list becomes empty the writable event is removed. */
static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
    redisClient *c = privdata;
    int nwritten = 0, totwritten = 0;
    REDIS_NOTUSED(el);
    REDIS_NOTUSED(mask);

    /* Use writev() if we have enough buffers to send */
    if (!server.glueoutputbuf &&
        !(c->flags & REDIS_MASTER) &&
        listLength(c->reply) > REDIS_WRITEV_THRESHOLD)
    {
        sendReplyToClientWritev(el, fd, privdata, mask);
        return;
    }

    while (listLength(c->reply) != 0) {
        listNode *head;
        robj *o;
        int objlen, remaining;

        if (server.glueoutputbuf && listLength(c->reply) > 1)
            glueReplyBuffersIfNeeded(c);

        head = listFirst(c->reply);
        o = listNodeValue(head);
        objlen = sdslen(o->ptr);

        /* Empty objects carry no data: just drop them. */
        if (objlen == 0) {
            listDelNode(c->reply,head);
            continue;
        }

        remaining = objlen - c->sentlen;
        if (c->flags & REDIS_MASTER) {
            /* Don't reply to a master: pretend everything was written. */
            nwritten = remaining;
        } else {
            nwritten = write(fd,((char*)o->ptr)+c->sentlen,remaining);
            if (nwritten <= 0) break;
        }
        c->sentlen += nwritten;
        totwritten += nwritten;

        /* If we fully sent the object on head go to the next one */
        if (c->sentlen == objlen) {
            listDelNode(c->reply,head);
            c->sentlen = 0;
        }

        /* Avoid sending more than REDIS_MAX_WRITE_PER_EVENT bytes per
         * event: in a single threaded server it is a good idea to serve
         * other clients as well, even if a very large reply goes to a super
         * fast link that is always able to accept data (in a real world
         * scenario think about 'KEYS *' against the loopback interface). */
        if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
    }

    /* EAGAIN just means "retry on the next writable event". */
    if (nwritten == -1 && errno != EAGAIN) {
        redisLog(REDIS_VERBOSE,
            "Error writing to client: %s", strerror(errno));
        freeClient(c);
        return;
    }

    if (totwritten > 0) c->lastinteraction = time(NULL);
    if (listLength(c->reply) == 0) {
        c->sentlen = 0;
        aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
    }
}

2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060
/* writev() based variant of sendReplyToClient(): flush the pending reply
 * list of the client (passed as privdata) sending several reply objects per
 * system call.
 *
 * Each round collects up to REDIS_WRITEV_IOVEC_COUNT objects from the head
 * of c->reply into an iovec array, writes them with a single writev(), and
 * then removes the objects that were completely sent. c->sentlen tracks how
 * many bytes of the head object were already sent. On a write error other
 * than EAGAIN the client is freed. When the list becomes empty the writable
 * event is removed.
 *
 * An object is normally not queued when it would push the total past
 * REDIS_MAX_WRITE_PER_EVENT, but the very first object of an invocation is
 * always queued. Without this exception a head object that is larger than
 * the budget on its own would never be sent: the handler would return
 * without writing anything and the writable event would fire again forever.
 *
 * For the same reason zero-length objects are reaped even when no byte was
 * written, so a list that only contains empty objects cannot stall. */
static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
{
    redisClient *c = privdata;
    int nwritten = 0, totwritten = 0, objlen, willwrite;
    robj *o;
    struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
    int offset, ion = 0;
    listNode *node;
    REDIS_NOTUSED(el);
    REDIS_NOTUSED(mask);

    while (listLength(c->reply)) {
        offset = c->sentlen;
        ion = 0;
        willwrite = 0;

        /* fill-in the iov[] array */
        for (node = listFirst(c->reply); node; node = listNextNode(node)) {
            o = listNodeValue(node);
            objlen = sdslen(o->ptr);

            /* Enforce the per-event budget, but never on the first object
             * of this invocation, otherwise we could make no progress. */
            if ((ion > 0 || totwritten > 0) &&
                totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
                break;

            if (ion == REDIS_WRITEV_IOVEC_COUNT)
                break; /* no more iovecs */

            iov[ion].iov_base = ((char*)o->ptr) + offset;
            iov[ion].iov_len = objlen - offset;
            willwrite += objlen - offset;
            offset = 0; /* just for the first item */
            ion++;
        }

        /* Nothing collected: the budget for this event is exhausted. */
        if (ion == 0)
            break;

        if (willwrite > 0) {
            /* write all collected blocks at once */
            if ((nwritten = writev(fd, iov, ion)) < 0) {
                if (errno != EAGAIN) {
                    redisLog(REDIS_VERBOSE,
                             "Error writing to client: %s", strerror(errno));
                    freeClient(c);
                    return;
                }
                break;
            }
        } else {
            /* Only zero-length objects were collected: there is nothing to
             * send, but they still have to be removed below. */
            nwritten = 0;
        }

        totwritten += nwritten;
        offset = c->sentlen;

        /* remove written robjs from c->reply */
        while (listLength(c->reply)) {
            o = listNodeValue(listFirst(c->reply));
            objlen = sdslen(o->ptr);

            if (nwritten >= objlen - offset) {
                listDelNode(c->reply, listFirst(c->reply));
                nwritten -= objlen - offset;
                c->sentlen = 0;
            } else {
                /* partial write (or nothing left to account for) */
                c->sentlen += nwritten;
                break;
            }
            offset = 0;
        }
    }

    if (totwritten > 0)
        c->lastinteraction = time(NULL);

    if (listLength(c->reply) == 0) {
        c->sentlen = 0;
        aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
    }
}

A
antirez 已提交
2118 2119 2120
static struct redisCommand *lookupCommand(char *name) {
    int j = 0;
    while(cmdTable[j].name != NULL) {
2121
        if (!strcasecmp(name,cmdTable[j].name)) return &cmdTable[j];
A
antirez 已提交
2122 2123 2124 2125 2126 2127 2128 2129 2130
        j++;
    }
    return NULL;
}

/* resetClient prepares the client to process the next command: the current
 * argument vector is released and the protocol parsing state goes back to
 * "no multi bulk request in progress, no bulk payload expected". */
static void resetClient(redisClient *c) {
    freeClientArgv(c);
    c->multibulk = 0;
    c->bulklen = -1;
}

A
antirez 已提交
2134 2135 2136 2137 2138 2139
/* call() is the core of Redis execution of a command.
 *
 * It runs the command implementation and then uses the variation of
 * server.dirty across the call to decide whether the command has to be
 * propagated:
 *
 *  - to the append only file, when it is enabled and the counter changed;
 *  - to the slaves, when there is at least one and either the counter
 *    changed or the command has the REDIS_CMD_FORCE_REPLICATION flag;
 *  - to the MONITOR clients, always, when there is at least one.
 *
 * Finally the counter of processed commands is incremented. */
static void call(redisClient *c, struct redisCommand *cmd) {
    long long before = server.dirty;
    long long changes;
    int propagate;

    cmd->proc(c);
    changes = server.dirty - before;

    if (changes && server.appendonly)
        feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);

    propagate = changes || (cmd->flags & REDIS_CMD_FORCE_REPLICATION);
    if (propagate && listLength(server.slaves))
        replicationFeedSlaves(server.slaves,c->db->id,c->argv,c->argc);

    if (listLength(server.monitors))
        replicationFeedSlaves(server.monitors,c->db->id,c->argv,c->argc);

    server.stat_numcommands++;
}

A
antirez 已提交
2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162
/* If this function gets called we already read a whole
 * command, arguments are in the client argv/argc fields.
 * processCommand() executes the command or prepares the
 * server for a bulk read from the client.
 *
 * The function also drives the multi bulk protocol state machine: the
 * "*<count>" header, every "$<len>" header and every bulk payload reach this
 * function as separate invocations, and the real command is executed only
 * once the last argument was collected.
 *
 * If 1 is returned the client is still alive and valid and
 * other operations can be performed by the caller. Otherwise
 * if 0 is returned the client was destroyed (i.e. after QUIT). */
static int processCommand(redisClient *c) {
    struct redisCommand *cmd;

    /* Free some memory if needed (maxmemory setting) */
    if (server.maxmemory) freeMemoryIfNeeded();

    /* Handle the multi bulk command type. This is an alternative protocol
     * supported by Redis in order to receive commands that are composed of
     * multiple binary-safe "bulk" arguments. The latency of processing is
     * a bit higher but this allows things like multi-sets, so if this
     * protocol is used only for MSET and similar commands this is a big win. */
    if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
        /* "*<count>": start of a multi bulk request. */
        c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
        if (c->multibulk <= 0) {
            resetClient(c);
            return 1;
        } else {
            decrRefCount(c->argv[c->argc-1]);
            c->argc--;
            return 1;
        }
    } else if (c->multibulk) {
        if (c->bulklen == -1) {
            /* We expect the "$<len>" header of the next argument. */
            if (((char*)c->argv[0]->ptr)[0] != '$') {
                addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
                resetClient(c);
                return 1;
            } else {
                int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
                decrRefCount(c->argv[0]);
                if (bulklen < 0 || bulklen > 1024*1024*1024) {
                    c->argc--;
                    addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
                    resetClient(c);
                    return 1;
                }
                c->argc--;
                c->bulklen = bulklen+2; /* add two bytes for CR+LF */
                return 1;
            }
        } else {
            /* A bulk payload was read: append it to the multi bulk vector. */
            c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
            c->mbargv[c->mbargc] = c->argv[0];
            c->mbargc++;
            c->argc--;
            c->multibulk--;
            if (c->multibulk == 0) {
                robj **auxargv;
                int auxargc;

                /* Here we need to swap the multi-bulk argc/argv with the
                 * normal argc/argv of the client structure. */
                auxargv = c->argv;
                c->argv = c->mbargv;
                c->mbargv = auxargv;

                auxargc = c->argc;
                c->argc = c->mbargc;
                c->mbargc = auxargc;

                /* We need to set bulklen to something different than -1
                 * in order for the code below to process the command without
                 * trying to read the last argument of a bulk command as
                 * a special argument. */
                c->bulklen = 0;
                /* continue below and process the command */
            } else {
                c->bulklen = -1;
                return 1;
            }
        }
    }
    /* -- end of multi bulk commands processing -- */

    /* The QUIT command is handled as a special case. Normal command
     * procs are unable to close the client connection safely */
    if (!strcasecmp(c->argv[0]->ptr,"quit")) {
        freeClient(c);
        return 0;
    }

    /* Now lookup the command and check ASAP about trivial error conditions
     * such as wrong arity, bad command name and so forth. */
    cmd = lookupCommand(c->argv[0]->ptr);
    if (!cmd) {
        addReplySds(c,
            sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
                (char*)c->argv[0]->ptr));
        resetClient(c);
        return 1;
    } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
               (c->argc < -cmd->arity)) {
        addReplySds(c,
            sdscatprintf(sdsempty(),
                "-ERR wrong number of arguments for '%s' command\r\n",
                cmd->name));
        resetClient(c);
        return 1;
    } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
        /* This is a bulk command, we have to read the last argument yet. */
        int bulklen = atoi(c->argv[c->argc-1]->ptr);

        decrRefCount(c->argv[c->argc-1]);
        if (bulklen < 0 || bulklen > 1024*1024*1024) {
            c->argc--;
            addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
            resetClient(c);
            return 1;
        }
        c->argc--;
        c->bulklen = bulklen+2; /* add two bytes for CR+LF */
        /* It is possible that the bulk read is already in the
         * buffer. Check this condition and handle it accordingly.
         * This is just a fast path, alternative to call processInputBuffer().
         * It's a good idea since the code is small and this condition
         * happens most of the times. */
        if ((signed)sdslen(c->querybuf) >= c->bulklen) {
            c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
            c->argc++;
            c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
        } else {
            /* Otherwise return... the last argument has still to be read
             * from the socket. */
            return 1;
        }
    }
    /* Let's try to encode the bulk object to save space. */
    if (cmd->flags & REDIS_CMD_BULK)
        c->argv[c->argc-1] = tryObjectEncoding(c->argv[c->argc-1]);

    /* Check if the user is authenticated */
    if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
        addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
        resetClient(c);
        return 1;
    }

    /* Handle the maxmemory directive */
    if (server.maxmemory && (cmd->flags & REDIS_CMD_DENYOOM) &&
        zmalloc_used_memory() > server.maxmemory)
    {
        addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
        resetClient(c);
        return 1;
    }

    /* Only allow (P)SUBSCRIBE and (P)UNSUBSCRIBE in the context of Pub/Sub.
     * A client is in that context when it is subscribed to at least one
     * channel OR to at least one pattern: checking the channels alone would
     * let a client that only used PSUBSCRIBE run any command. */
    if ((dictSize(c->pubsub_channels) > 0 ||
         listLength(c->pubsub_patterns) > 0) &&
        cmd->proc != subscribeCommand && cmd->proc != unsubscribeCommand &&
        cmd->proc != psubscribeCommand && cmd->proc != punsubscribeCommand) {
        addReplySds(c,sdsnew("-ERR only (P)SUBSCRIBE / (P)UNSUBSCRIBE / QUIT allowed in this context\r\n"));
        resetClient(c);
        return 1;
    }

    /* Exec the command: inside a MULTI block everything but EXEC and
     * DISCARD is just queued. */
    if (c->flags & REDIS_MULTI && cmd->proc != execCommand && cmd->proc != discardCommand) {
        queueMultiCommand(c,cmd);
        addReply(c,shared.queued);
    } else {
        /* With threaded VM the client may have to wait for swapped keys:
         * in that case return without resetting the client. */
        if (server.vm_enabled && server.vm_max_threads > 0 &&
            blockClientOnSwappedKeys(cmd,c)) return 1;
        call(c,cmd);
    }

    /* Prepare the client for the next command */
    resetClient(c);
    return 1;
}

2330
/* Send the command described by argv/argc, executed against database
 * 'dictid', to every client in the 'slaves' list (slaves or MONITOR
 * clients), encoded with the multi bulk protocol.
 *
 * Clients still waiting for a BGSAVE to start are skipped. A client whose
 * currently selected database differs from 'dictid' first receives the
 * appropriate SELECT command. */
static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
    /* We need 1+(ARGS*3) objects since commands are using the new protocol:
     * one object for the leading "*<count>\r\n" multibulk count, then for
     * every argument "$<count>\r\n" + object + "\r\n". */
    robj *static_outv[REDIS_STATIC_ARGS*3+1];
    robj **outv = static_outv;
    int outc, i;
    listIter iter;
    listNode *node;

    if (argc > REDIS_STATIC_ARGS)
        outv = zmalloc(sizeof(robj*)*(argc*3+1));

    outv[0] = createObject(REDIS_STRING,
            sdscatprintf(sdsempty(), "*%d\r\n", argc));
    outv[0]->refcount = 0;
    outc = 1;
    for (i = 0; i < argc; i++) {
        robj *hdr = createObject(REDIS_STRING,
            sdscatprintf(sdsempty(),"$%lu\r\n",
                (unsigned long) stringObjectLen(argv[i])));

        hdr->refcount = 0;
        outv[outc] = hdr;
        outv[outc+1] = argv[i];
        outv[outc+2] = shared.crlf;
        outc += 3;
    }

    /* Increment all the refcounts at start and decrement at end in order to
     * be sure to free objects if there is no slave in a replication state
     * able to be fed with commands */
    for (i = 0; i < outc; i++) incrRefCount(outv[i]);

    listRewind(slaves,&iter);
    while ((node = listNext(&iter)) != NULL) {
        redisClient *slave = node->value;

        /* Don't feed slaves that are still waiting for BGSAVE to start */
        if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;

        /* Feed all the other slaves, MONITORs and so on */
        if (slave->slaveseldb != dictid) {
            robj *selectcmd;

            if (dictid >= 0 && dictid <= 9) {
                /* Pre-built shared SELECT commands for the first DBs. */
                robj *preset[10] = {
                    shared.select0, shared.select1, shared.select2,
                    shared.select3, shared.select4, shared.select5,
                    shared.select6, shared.select7, shared.select8,
                    shared.select9
                };
                selectcmd = preset[dictid];
            } else {
                selectcmd = createObject(REDIS_STRING,
                    sdscatprintf(sdsempty(),"select %d\r\n",dictid));
                selectcmd->refcount = 0;
            }
            addReply(slave,selectcmd);
            slave->slaveseldb = dictid;
        }
        for (i = 0; i < outc; i++) addReply(slave,outv[i]);
    }

    for (i = 0; i < outc; i++) decrRefCount(outv[i]);
    if (outv != static_outv) zfree(outv);
}

A
antirez 已提交
2402
/* Parse the client query buffer and execute every complete request found.
 *
 * Two states are handled, selected by c->bulklen:
 *
 *  - bulklen == -1: a newline terminated request line is expected. The line
 *    is removed from the query buffer, split on spaces into the client
 *    argument vector (empty tokens are discarded) and passed to
 *    processCommand().
 *  - otherwise: a bulk payload of c->bulklen bytes (CRLF included) is
 *    expected. Once it is all in the buffer it becomes the last argument
 *    and processCommand() is called.
 *
 * The function loops for as long as the client is valid and there is data
 * left. A client accumulating REDIS_REQUEST_MAX_SIZE or more bytes without
 * a newline is freed. */
static void processInputBuffer(redisClient *c) {
again:
    /* Before to process the input buffer, make sure the client is not
     * waiting for a blocking operation such as BLPOP. Note that the first
     * iteration the client is never blocked, otherwise the processInputBuffer
     * would not be called at all, but after the execution of the first commands
     * in the input buffer the client may be blocked, and the "goto again"
     * will try to reiterate. The following line will make it return asap. */
    if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
    if (c->bulklen == -1) {
        /* Read the first line of the query */
        char *p = strchr(c->querybuf,'\n');
        size_t querylen;

        if (p) {
            sds query, *argv;
            int argc, j;

            query = c->querybuf;
            c->querybuf = sdsempty();
            querylen = 1+(p-(query));
            if (sdslen(query) > querylen) {
                /* leave data after the first line of the query in the buffer */
                c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
            }
            *p = '\0'; /* remove "\n" */
            /* and "\r" if any. The p > query test matters when the line is
             * a lone "\n" at the very start of the buffer: there is no
             * previous character to inspect and p-1 would point before the
             * string. */
            if (p > query && *(p-1) == '\r') *(p-1) = '\0';
            sdsupdatelen(query);

            /* Now we can split the query in arguments */
            argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
            sdsfree(query);

            if (c->argv) zfree(c->argv);
            c->argv = zmalloc(sizeof(robj*)*argc);

            for (j = 0; j < argc; j++) {
                if (sdslen(argv[j])) {
                    c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
                    c->argc++;
                } else {
                    sdsfree(argv[j]);
                }
            }
            zfree(argv);
            if (c->argc) {
                /* Execute the command. If the client is still valid
                 * after processCommand() return and there is something
                 * on the query buffer try to process the next command. */
                if (processCommand(c) && sdslen(c->querybuf)) goto again;
            } else {
                /* Nothing to process, argc == 0. Just process the query
                 * buffer if it's not empty or return to the caller */
                if (sdslen(c->querybuf)) goto again;
            }
            return;
        } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
            redisLog(REDIS_VERBOSE, "Client protocol error");
            freeClient(c);
            return;
        }
    } else {
        /* Bulk read handling. Note that if we are at this point
           the client already sent a command terminated with a newline,
           we are reading the bulk data that is actually the last
           argument of the command. */
        int qbl = sdslen(c->querybuf);

        if (c->bulklen <= qbl) {
            /* Copy everything but the final CRLF as final argument */
            c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
            c->argc++;
            c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
            /* Process the command. If the client is still valid after
             * the processing and there is more data in the buffer
             * try to parse it. */
            if (processCommand(c) && sdslen(c->querybuf)) goto again;
            return;
        }
    }
}

A
antirez 已提交
2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495
/* Readable-event handler for client sockets: read up to REDIS_IOBUF_LEN
 * bytes, append them to the query buffer of the client (passed as privdata)
 * and try to process the buffered requests.
 *
 * The client is freed when the peer closed the connection or on a read
 * error other than EAGAIN. On EAGAIN the function just returns. */
static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
    redisClient *c = (redisClient*) privdata;
    char buf[REDIS_IOBUF_LEN];
    int nread;
    REDIS_NOTUSED(el);
    REDIS_NOTUSED(mask);

    nread = read(fd, buf, REDIS_IOBUF_LEN);
    if (nread == -1) {
        /* No data available right now: wait for the next event. */
        if (errno == EAGAIN) return;
        redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
        freeClient(c);
        return;
    }
    if (nread == 0) {
        redisLog(REDIS_VERBOSE, "Client closed connection");
        freeClient(c);
        return;
    }

    c->querybuf = sdscatlen(c->querybuf, buf, nread);
    c->lastinteraction = time(NULL);
    processInputBuffer(c);
}

A
antirez 已提交
2514 2515 2516
/* Make database number 'id' the current database of the client.
 * Returns REDIS_OK on success, or REDIS_ERR (leaving the client untouched)
 * if 'id' is not in the range 0 .. server.dbnum-1. */
static int selectDb(redisClient *c, int id) {
    if (id >= 0 && id < server.dbnum) {
        c->db = server.db + id;
        return REDIS_OK;
    }
    return REDIS_ERR;
}

2521 2522
static void *dupClientReplyValue(void *o) {
    incrRefCount((robj*)o);
2523
    return o;
2524 2525
}

A
antirez 已提交
2526 2527 2528 2529
/* Match method for lists of string objects: returns non-zero if the two
 * objects compare as equal according to compareStringObjects(), zero
 * otherwise. */
static int listMatchObjects(void *a, void *b) {
    int cmp = compareStringObjects(a,b);

    return cmp == 0;
}

A
antirez 已提交
2530 2531 2532 2533 2534 2535 2536 2537 2538 2539
static redisClient *createClient(int fd) {
    redisClient *c = zmalloc(sizeof(*c));

    anetNonBlock(NULL,fd);
    anetTcpNoDelay(NULL,fd);
    if (!c) return NULL;
    selectDb(c,0);
    c->fd = fd;
    c->querybuf = sdsempty();
    c->argc = 0;
2540
    c->argv = NULL;
A
antirez 已提交
2541
    c->bulklen = -1;
2542 2543 2544
    c->multibulk = 0;
    c->mbargc = 0;
    c->mbargv = NULL;
A
antirez 已提交
2545 2546 2547
    c->sentlen = 0;
    c->flags = 0;
    c->lastinteraction = time(NULL);
B
Brian Hammond 已提交
2548
    c->authenticated = 0;
2549
    c->replstate = REDIS_REPL_NONE;
2550
    c->reply = listCreate();
A
antirez 已提交
2551
    listSetFreeMethod(c->reply,decrRefCount);
2552
    listSetDupMethod(c->reply,dupClientReplyValue);
2553 2554 2555 2556
    c->blockingkeys = NULL;
    c->blockingkeysnum = 0;
    c->io_keys = listCreate();
    listSetFreeMethod(c->io_keys,decrRefCount);
A
antirez 已提交
2557 2558 2559 2560
    c->pubsub_channels = dictCreate(&setDictType,NULL);
    c->pubsub_patterns = listCreate();
    listSetFreeMethod(c->pubsub_patterns,decrRefCount);
    listSetMatchMethod(c->pubsub_patterns,listMatchObjects);
A
antirez 已提交
2561
    if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
2562
        readQueryFromClient, c) == AE_ERR) {
A
antirez 已提交
2563 2564 2565
        freeClient(c);
        return NULL;
    }
2566
    listAddNodeTail(server.clients,c);
A
antirez 已提交
2567
    initClientMultiState(c);
A
antirez 已提交
2568 2569 2570 2571 2572
    return c;
}

/* Append 'obj' to the list of replies pending for the client.
 *
 * When the list is empty and the client replication state is either
 * REDIS_REPL_NONE or REDIS_REPL_ONLINE, the writable event with
 * sendReplyToClient() as handler is installed first; if that fails the
 * reply is silently discarded.
 *
 * With VM enabled, an object whose storage is not REDIS_VM_MEMORY is
 * duplicated before being queued. The element actually queued is whatever
 * getDecodedObject() returns for the object. */
static void addReply(redisClient *c, robj *obj) {
    int writable_state = (c->replstate == REDIS_REPL_NONE ||
                          c->replstate == REDIS_REPL_ONLINE);

    if (listLength(c->reply) == 0 && writable_state) {
        if (aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
            sendReplyToClient, c) == AE_ERR) return;
    }

    if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
        obj = dupStringObject(obj);
        obj->refcount = 0; /* getDecodedObject() will increment the refcount */
    }
    listAddNodeTail(c->reply,getDecodedObject(obj));
}

/* Queue the sds string 's' as a reply for the client. The string is wrapped
 * into a temporary string object (it is NOT copied), and the local reference
 * to that object is dropped once it was handed to addReply(). */
static void addReplySds(redisClient *c, sds s) {
    robj *wrapper = createObject(REDIS_STRING,s);

    addReply(c,wrapper);
    decrRefCount(wrapper);
}

A
antirez 已提交
2591 2592 2593 2594
/* Reply with the double 'd' as a bulk string: the number is formatted with
 * "%.17g" and sent as "$<len>\r\n<digits>\r\n". */
static void addReplyDouble(redisClient *c, double d) {
    char digits[128];
    sds reply;

    snprintf(digits,sizeof(digits),"%.17g",d);
    reply = sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
        (unsigned long) strlen(digits),digits);
    addReplySds(c,reply);
}

2599 2600 2601 2602
/* Reply with the integer 'l' in the form ":<value>\r\n". The values 0 and 1
 * are served by the shared objects shared.czero and shared.cone. */
static void addReplyLong(redisClient *c, long l) {
    char buf[128];
    size_t len;

    if (l == 0 || l == 1) {
        addReply(c,(l == 0) ? shared.czero : shared.cone);
        return;
    }
    len = snprintf(buf,sizeof(buf),":%ld\r\n",l);
    addReplySds(c,sdsnewlen(buf,len));
}

2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628
/* Reply with the integer 'll' in the form ":<value>\r\n". The values 0 and 1
 * are served by the shared objects shared.czero and shared.cone. */
static void addReplyLongLong(redisClient *c, long long ll) {
    char buf[128];
    size_t len;

    if (ll == 0 || ll == 1) {
        addReply(c,(ll == 0) ? shared.czero : shared.cone);
        return;
    }
    len = snprintf(buf,sizeof(buf),":%lld\r\n",ll);
    addReplySds(c,sdsnewlen(buf,len));
}

2629 2630 2631 2632
/* Reply with the unsigned integer 'ul' in the form ":<value>\r\n". The
 * values 0 and 1 are served by the shared objects shared.czero and
 * shared.cone. */
static void addReplyUlong(redisClient *c, unsigned long ul) {
    char buf[128];
    size_t len;

    if (ul == 0 || ul == 1) {
        addReply(c,(ul == 0) ? shared.czero : shared.cone);
        return;
    }
    len = snprintf(buf,sizeof(buf),":%lu\r\n",ul);
    addReplySds(c,sdsnewlen(buf,len));
}

2644 2645 2646 2647 2648 2649 2650 2651
/* Reply with the bulk length header "$<len>\r\n" for the string object
 * 'obj'.
 *
 * For REDIS_ENCODING_RAW objects <len> is the length of the sds string.
 * For any other encoding the pointer field is read as a long and <len> is
 * the number of characters of its decimal representation, including the
 * minus sign for negative values. */
static void addReplyBulkLen(redisClient *c, robj *obj) {
    size_t len;

    if (obj->encoding == REDIS_ENCODING_RAW) {
        len = sdslen(obj->ptr);
    } else {
        long n = (long)obj->ptr;

        /* Compute how many bytes will take this integer as a radix 10
         * string. The value is deliberately NOT negated: -LONG_MIN
         * overflows, and since the division truncates toward zero the
         * number of digits can be counted on the negative value as well. */
        len = 1;
        if (n < 0) len++; /* room for the '-' sign */
        while ((n = n/10) != 0) len++;
    }
    addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",(unsigned long)len));
}

A
antirez 已提交
2665 2666 2667 2668 2669 2670
/* Reply with the string object 'obj' as a bulk reply, that is the
 * "$<len>\r\n" header, then the object itself, then the final CRLF. */
static void addReplyBulk(redisClient *c, robj *obj) {
    addReplyBulkLen(c,obj);     /* "$<len>\r\n" */
    addReply(c,obj);            /* payload */
    addReply(c,shared.crlf);    /* "\r\n" */
}

2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681
/* In the CONFIG command we need to add vanilla C strings as bulk replies.
 * A NULL pointer is sent as the shared null bulk reply; any other string is
 * copied into a temporary string object that is released after the reply
 * was queued. */
static void addReplyBulkCString(redisClient *c, char *s) {
    robj *tmp;

    if (s == NULL) {
        addReply(c,shared.nullbulk);
        return;
    }
    tmp = createStringObject(s,strlen(s));
    addReplyBulk(c,tmp);
    decrRefCount(tmp);
}

A
antirez 已提交
2682 2683 2684
/* Readable-event handler for the listening socket 'fd': accept a new
 * connection and create the client for it.
 *
 * If the maxclients limit is set and the new client exceeds it, a best
 * effort error is written to the socket and the client is freed; otherwise
 * the counter of received connections is incremented. */
static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
    char peerip[128];
    int peerport, clientfd;
    redisClient *client;
    REDIS_NOTUSED(el);
    REDIS_NOTUSED(mask);
    REDIS_NOTUSED(privdata);

    clientfd = anetAccept(server.neterr, fd, peerip, &peerport);
    if (clientfd == AE_ERR) {
        redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
        return;
    }
    redisLog(REDIS_VERBOSE,"Accepted %s:%d", peerip, peerport);

    client = createClient(clientfd);
    if (client == NULL) {
        redisLog(REDIS_WARNING,"Error allocating resoures for the client");
        close(clientfd); /* May be already closed, just ignore errors */
        return;
    }

    /* Common case: no limit configured, or limit not exceeded. */
    if (!server.maxclients || listLength(server.clients) <= server.maxclients) {
        server.stat_numconnections++;
        return;
    }

    /* The maxclients directive is set and this is one client more... close
     * the connection. Note that we create the client instead of checking
     * for this condition before, since now the socket is already set in
     * nonblocking mode and we can send an error for free using the
     * Kernel I/O */
    {
        char *err = "-ERR max number of clients reached\r\n";

        /* That's a best effort error message, don't check write errors */
        if (write(client->fd,err,strlen(err)) == -1) {
            /* Nothing to do, Just to avoid the warning... */
        }
        freeClient(client);
    }
}

/* ======================= Redis objects implementation ===================== */

static robj *createObject(int type, void *ptr) {
    robj *o;

2723
    if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
A
antirez 已提交
2724 2725 2726 2727
    if (listLength(server.objfreelist)) {
        listNode *head = listFirst(server.objfreelist);
        o = listNodeValue(head);
        listDelNode(server.objfreelist,head);
2728
        if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
A
antirez 已提交
2729
    } else {
2730
        if (server.vm_enabled) {
2731
            pthread_mutex_unlock(&server.obj_freelist_mutex);
2732 2733 2734 2735
            o = zmalloc(sizeof(*o));
        } else {
            o = zmalloc(sizeof(*o)-sizeof(struct redisObjectVM));
        }
A
antirez 已提交
2736 2737
    }
    o->type = type;
2738
    o->encoding = REDIS_ENCODING_RAW;
A
antirez 已提交
2739 2740
    o->ptr = ptr;
    o->refcount = 1;
A
antirez 已提交
2741
    if (server.vm_enabled) {
A
antirez 已提交
2742 2743 2744 2745
        /* Note that this code may run in the context of an I/O thread
         * and accessing to server.unixtime in theory is an error
         * (no locks). But in practice this is safe, and even if we read
         * garbage Redis will not fail, as it's just a statistical info */
A
antirez 已提交
2746 2747 2748
        o->vm.atime = server.unixtime;
        o->storage = REDIS_VM_MEMORY;
    }
A
antirez 已提交
2749 2750 2751 2752 2753 2754 2755
    return o;
}

static robj *createStringObject(char *ptr, size_t len) {
    return createObject(REDIS_STRING,sdsnewlen(ptr,len));
}

A
antirez 已提交
2756
/* Return a new string object holding a copy of the sds string of 'o'.
 * The source object must be raw-encoded. */
static robj *dupStringObject(robj *o) {
    sds src;

    assert(o->encoding == REDIS_ENCODING_RAW);
    src = o->ptr;
    return createStringObject(src,sdslen(src));
}

A
antirez 已提交
2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772
/* Create an empty list object. The underlying list releases its values
 * with decrRefCount(). */
static robj *createListObject(void) {
    list *elements = listCreate();

    listSetFreeMethod(elements,decrRefCount);
    return createObject(REDIS_LIST,elements);
}

/* Create an empty set object backed by a dict using setDictType. */
static robj *createSetObject(void) {
    dict *members = dictCreate(&setDictType,NULL);

    return createObject(REDIS_SET,members);
}

2773 2774 2775 2776 2777 2778 2779 2780 2781 2782
/* Create an empty hash object.
 *
 * All the hashes start as zipmaps. They will be automatically converted
 * into hash tables if there are enough elements or big elements inside. */
static robj *createHashObject(void) {
    robj *hash = createObject(REDIS_HASH,zipmapNew());

    hash->encoding = REDIS_ENCODING_ZIPMAP;
    return hash;
}

2783
static robj *createZsetObject(void) {
2784 2785 2786 2787 2788
    zset *zs = zmalloc(sizeof(*zs));

    zs->dict = dictCreate(&zsetDictType,NULL);
    zs->zsl = zslCreate();
    return createObject(REDIS_ZSET,zs);
2789 2790
}

A
antirez 已提交
2791
/* Release the payload of a string object. Only raw-encoded strings own an
 * sds string; for any other encoding there is nothing to free here. */
static void freeStringObject(robj *o) {
    if (o->encoding != REDIS_ENCODING_RAW) return;
    sdsfree(o->ptr);
}

/* Release the list held by a list object. */
static void freeListObject(robj *o) {
    list *elements = o->ptr;

    listRelease(elements);
}

/* Release the dict held by a set object. */
static void freeSetObject(robj *o) {
    dict *members = o->ptr;

    dictRelease(members);
}

2805 2806 2807 2808 2809 2810 2811 2812
/* Release a sorted set: its dict, its skip list, and finally the zset
 * structure itself. */
static void freeZsetObject(robj *o) {
    zset *sorted = o->ptr;

    dictRelease(sorted->dict);
    zslFree(sorted->zsl);
    zfree(sorted);
}

A
antirez 已提交
2813
/* Release the payload of a hash object according to its encoding:
 * a dict for REDIS_ENCODING_HT, a flat allocation for
 * REDIS_ENCODING_ZIPMAP. Any other encoding is a fatal error. */
static void freeHashObject(robj *o) {
    if (o->encoding == REDIS_ENCODING_HT) {
        dictRelease((dict*) o->ptr);
    } else if (o->encoding == REDIS_ENCODING_ZIPMAP) {
        zfree(o->ptr);
    } else {
        redisAssert(0);
    }
}

/* Take one more reference to 'o'. */
static void incrRefCount(robj *o) {
    o->refcount += 1;
}

static void decrRefCount(void *obj) {
    robj *o = obj;
2833

2834 2835
    /* Object is a key of a swapped out value, or in the process of being
     * loaded. */
2836 2837 2838 2839
    if (server.vm_enabled &&
        (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
    {
        if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(obj);
2840
        redisAssert(o->type == REDIS_STRING);
A
antirez 已提交
2841 2842
        freeStringObject(o);
        vmMarkPagesFree(o->vm.page,o->vm.usedpages);
2843
        pthread_mutex_lock(&server.obj_freelist_mutex);
A
antirez 已提交
2844 2845 2846
        if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
            !listAddNodeHead(server.objfreelist,o))
            zfree(o);
2847
        pthread_mutex_unlock(&server.obj_freelist_mutex);
A
antirez 已提交
2848
        server.vm_stats_swapped_objects--;
A
antirez 已提交
2849 2850
        return;
    }
2851
    /* Object is in memory, or in the process of being swapped out. */
A
antirez 已提交
2852
    if (--(o->refcount) == 0) {
2853 2854
        if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
            vmCancelThreadedIOJob(obj);
A
antirez 已提交
2855 2856 2857 2858
        switch(o->type) {
        case REDIS_STRING: freeStringObject(o); break;
        case REDIS_LIST: freeListObject(o); break;
        case REDIS_SET: freeSetObject(o); break;
2859
        case REDIS_ZSET: freeZsetObject(o); break;
A
antirez 已提交
2860
        case REDIS_HASH: freeHashObject(o); break;
A
antirez 已提交
2861
        default: redisAssert(0); break;
A
antirez 已提交
2862
        }
2863
        if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
A
antirez 已提交
2864 2865 2866
        if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
            !listAddNodeHead(server.objfreelist,o))
            zfree(o);
2867
        if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
A
antirez 已提交
2868 2869 2870
    }
}

2871 2872
/* Return the value stored at 'key' in 'db', or NULL if the key does not
 * exist.
 *
 * With VM enabled the storage state is read from the key object stored in
 * the dict (not from the 'key' argument):
 * - value in memory or being swapped out: a pending swap-out is cancelled
 *   and the access time of the stored key is refreshed;
 * - value swapped out or being loaded: the value is loaded with
 *   vmLoadObject() and stored back into the dict entry; if it was in the
 *   loading state, clients blocked on this key are handled too. */
static robj *lookupKey(redisDb *db, robj *key) {
    dictEntry *de = dictFind(db->dict,key);
    robj *stored, *val;
    int notify;

    if (de == NULL) return NULL;

    stored = dictGetEntryKey(de);
    val = dictGetEntryVal(de);
    if (!server.vm_enabled) return val;

    if (stored->storage == REDIS_VM_MEMORY ||
        stored->storage == REDIS_VM_SWAPPING)
    {
        /* If we were swapping the object out, stop it, this key
         * was requested. */
        if (stored->storage == REDIS_VM_SWAPPING)
            vmCancelThreadedIOJob(stored);
        /* Update the access time of the key for the aging algorithm. */
        stored->vm.atime = server.unixtime;
        return val;
    }

    /* Our value was swapped on disk. Bring it at home. */
    notify = (stored->storage == REDIS_VM_LOADING);
    redisAssert(val == NULL);
    val = vmLoadObject(stored);
    dictGetEntryVal(de) = val;

    /* Clients blocked by the VM subsystem may be waiting for
     * this key... */
    if (notify) handleClientsBlockedOnSwappedKey(db,stored);
    return val;
}

/* Lookup used by read operations: expireIfNeeded() is called on the key
 * before the actual lookup. Returns the value or NULL. */
static robj *lookupKeyRead(redisDb *db, robj *key) {
    robj *val;

    expireIfNeeded(db,key);
    val = lookupKey(db,key);
    return val;
}

/* Lookup used by write operations: deleteIfVolatile() is called on the key
 * before the actual lookup. Returns the value or NULL. */
static robj *lookupKeyWrite(redisDb *db, robj *key) {
    robj *val;

    deleteIfVolatile(db,key);
    val = lookupKey(db,key);
    return val;
}

2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935
/* Like lookupKeyRead() on the client's selected DB, but when the key is
 * missing 'reply' is sent to the client. Returns the value or NULL. */
static robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply) {
    robj *val = lookupKeyRead(c->db, key);

    if (val == NULL) addReply(c,reply);
    return val;
}

/* Like lookupKeyWrite() on the client's selected DB, but when the key is
 * missing 'reply' is sent to the client. Returns the value or NULL. */
static robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply) {
    robj *val = lookupKeyWrite(c->db, key);

    if (val == NULL) addReply(c,reply);
    return val;
}

/* Check that 'o' has the expected 'type'. Returns 0 when the type
 * matches; otherwise shared.wrongtypeerr is sent to the client and 1 is
 * returned. */
static int checkType(redisClient *c, robj *o, int type) {
    if (o->type == type) return 0;

    addReply(c,shared.wrongtypeerr);
    return 1;
}

2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950
/* Remove 'key' from the DB (both from the expires dict, when it is not
 * empty, and from the main dict). Returns 1 if the key was found and
 * deleted from the main dict, 0 otherwise. */
static int deleteKey(redisDb *db, robj *key) {
    int deleted;

    /* We need to protect key from destruction: after the first dictDelete()
     * it may happen that 'key' is no longer valid if we don't increment
     * its count. This may happen when we get the object reference directly
     * from the hash table with dictRandomKey() or dict iterators */
    incrRefCount(key);
    if (dictSize(db->expires) != 0) dictDelete(db->expires,key);
    deleted = (dictDelete(db->dict,key) == DICT_OK);
    decrRefCount(key);

    return deleted;
}

2951 2952 2953 2954 2955 2956
/* Check if the nul-terminated string 's' can be represented by a long
 * (that is, is a number that fits into long without any other space or
 * character before or after the digits).
 *
 * If so, the function returns REDIS_OK and *longval is set to the value
 * of the number. Otherwise REDIS_ERR is returned */
2957
static int isStringRepresentableAsLong(sds s, long *longval) {
2958 2959 2960
    char buf[32], *endptr;
    long value;
    int slen;
A
Alex McHale 已提交
2961

2962 2963 2964 2965 2966 2967
    value = strtol(s, &endptr, 10);
    if (endptr[0] != '\0') return REDIS_ERR;
    slen = snprintf(buf,32,"%ld",value);

    /* If the number converted back into a string is not identical
     * then it's not possible to encode the string as integer */
2968
    if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
2969 2970 2971 2972
    if (longval) *longval = value;
    return REDIS_OK;
}

2973
/* Try to encode a string object in order to save space */
2974
static robj *tryObjectEncoding(robj *o) {
2975 2976
    long value;
    sds s = o->ptr;
A
antirez 已提交
2977

2978
    if (o->encoding != REDIS_ENCODING_RAW)
2979
        return o; /* Already encoded */
A
antirez 已提交
2980

2981
    /* It's not safe to encode shared objects: shared objects can be shared
2982 2983
     * everywhere in the "object space" of Redis. Encoded objects can only
     * appear as "values" (and not, for instance, as keys) */
2984
     if (o->refcount > 1) return o;
A
antirez 已提交
2985

2986
    /* Currently we try to encode only strings */
2987
    redisAssert(o->type == REDIS_STRING);
2988

2989
    /* Check if we can represent this string as a long integer */
2990
    if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return o;
2991 2992

    /* Ok, this object can be encoded */
2993 2994 2995 2996 2997 2998 2999 3000 3001 3002
    if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
        decrRefCount(o);
        incrRefCount(shared.integers[value]);
        return shared.integers[value];
    } else {
        o->encoding = REDIS_ENCODING_INT;
        sdsfree(o->ptr);
        o->ptr = (void*) value;
        return o;
    }
3003 3004
}

3005 3006 3007
/* Get a decoded version of an encoded object (returned as a new object).
 * If the object is already raw-encoded just increment the ref count.
 *
 * In both cases the caller owns one reference to the returned object and
 * must release it with decrRefCount(). Only integer-encoded strings can
 * be decoded; any other non-raw object triggers an assertion. */
static robj *getDecodedObject(robj *o) {
    char buf[32];

    if (o->encoding == REDIS_ENCODING_RAW) {
        incrRefCount(o);
        return o;
    }
    if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
        snprintf(buf,sizeof(buf),"%ld",(long)o->ptr);
        return createStringObject(buf,strlen(buf));
    }
    redisAssert(1 != 1);
    /* Not reached when the assertion aborts. The explicit return makes
     * sure we never flow off the end of a non-void function. */
    return NULL;
}

3025 3026 3027
/* Compare two string objects via strcmp() or alike.
 * Note that the objects may be integer-encoded. In such a case we
 * use snprintf() to get a string representation of the numbers on the stack
3028 3029 3030 3031 3032
 * and compare the strings, it's much faster than calling getDecodedObject().
 *
 * Important note: if objects are not integer encoded, but binary-safe strings,
 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
 * binary safe. */
3033
static int compareStringObjects(robj *a, robj *b) {
3034
    redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
3035 3036
    char bufa[128], bufb[128], *astr, *bstr;
    int bothsds = 1;
3037

3038
    if (a == b) return 0;
3039 3040 3041 3042
    if (a->encoding != REDIS_ENCODING_RAW) {
        snprintf(bufa,sizeof(bufa),"%ld",(long) a->ptr);
        astr = bufa;
        bothsds = 0;
3043
    } else {
3044
        astr = a->ptr;
3045
    }
3046 3047 3048 3049 3050 3051 3052 3053
    if (b->encoding != REDIS_ENCODING_RAW) {
        snprintf(bufb,sizeof(bufb),"%ld",(long) b->ptr);
        bstr = bufb;
        bothsds = 0;
    } else {
        bstr = b->ptr;
    }
    return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
3054 3055
}

3056
/* Return the length of the string representation of a string object:
 * the sds length for raw strings, otherwise the number of characters
 * needed to print the integer stored in the pointer field. */
static size_t stringObjectLen(robj *o) {
    char buf[32];

    redisAssert(o->type == REDIS_STRING);
    if (o->encoding != REDIS_ENCODING_RAW)
        return snprintf(buf,32,"%ld",(long)o->ptr);
    return sdslen(o->ptr);
}

A
Alex McHale 已提交
3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137
/* Extract a double from the string object 'o' and store it into '*value'.
 *
 * A NULL object is treated as 0. Raw strings are parsed with strtod() and
 * must be fully consumed by the parser; integer-encoded strings are
 * converted directly. On failure an error reply is sent to the client 'c'
 * and REDIS_ERR is returned, otherwise REDIS_OK is returned. */
static int getDoubleFromObject(redisClient *c, robj *o, double *value) {
    double parsedValue;
    char *eptr = NULL;

    if (o == NULL) {
        parsedValue = 0;
    } else {
        if (o->type != REDIS_STRING) {
            addReplySds(c,sdsnew("-ERR value is not a double\r\n"));
            return REDIS_ERR;
        }
        if (o->encoding == REDIS_ENCODING_RAW)
            parsedValue = strtod(o->ptr, &eptr);
        else if (o->encoding == REDIS_ENCODING_INT)
            parsedValue = (long)o->ptr;
        else
            redisAssert(1 != 1);
    }

    /* eptr is set only when strtod() was used: trailing bytes are an error. */
    if (eptr != NULL && *eptr != '\0') {
        addReplySds(c,sdsnew("-ERR value is not a double\r\n"));
        return REDIS_ERR;
    }

    *value = parsedValue;
    return REDIS_OK;
}

/* Extract a long long from the string object 'o' and store it into
 * '*value'.
 *
 * A NULL object is treated as 0. Raw strings are parsed with strtoll()
 * (base 10) and must be fully consumed by the parser; integer-encoded
 * strings are converted directly.
 *
 * On failure an error reply is sent to the client 'c' and REDIS_ERR is
 * returned ('*value' is left untouched), otherwise REDIS_OK is returned.
 * Failures are: 'o' is not a string, the string has trailing characters
 * after the number, or the number does not fit into a long long. */
static int getLongLongFromObject(redisClient *c, robj *o, long long *value) {
    long long parsedValue = 0;
    char *eptr = NULL;
    int outofrange = 0;

    if (o && o->type != REDIS_STRING) {
        addReplySds(c,sdsnew("-ERR value is not an integer\r\n"));
        return REDIS_ERR;
    }

    if (o == NULL) {
        parsedValue = 0;
    } else if (o->encoding == REDIS_ENCODING_RAW) {
        /* strtoll() clamps to LLONG_MIN/LLONG_MAX on overflow and reports
         * it only via errno, so errno must be cleared before the call. */
        errno = 0;
        parsedValue = strtoll(o->ptr, &eptr, 10);
        outofrange = (errno == ERANGE);
    } else if (o->encoding == REDIS_ENCODING_INT) {
        parsedValue = (long)o->ptr;
    } else {
        redisAssert(1 != 1);
    }

    if (eptr != NULL && *eptr != '\0') {
        addReplySds(c,sdsnew("-ERR value is not an integer\r\n"));
        return REDIS_ERR;
    }
    if (outofrange) {
        addReplySds(c,sdsnew("-ERR value is out of range\r\n"));
        return REDIS_ERR;
    }

    *value = parsedValue;

    return REDIS_OK;
}

/* Like getLongLongFromObject() but the result must also fit into a long.
 * If it does not, an out of range error is sent to the client and
 * REDIS_ERR is returned. */
static int getLongFromObject(redisClient *c, robj *o, long *value) {
    long long wide;

    if (getLongLongFromObject(c, o, &wide) != REDIS_OK) return REDIS_ERR;

    if (wide >= LONG_MIN && wide <= LONG_MAX) {
        *value = wide;
        return REDIS_OK;
    }

    addReplySds(c,sdsnew("-ERR value is out of range\r\n"));
    return REDIS_ERR;
}

3138
/*============================ RDB saving/loading =========================== */
A
antirez 已提交
3139

3140 3141 3142 3143 3144
/* Write the single byte 'type' to 'fp'. Returns 0 on success, -1 if the
 * byte can't be written. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    size_t written = fwrite(&type,1,1,fp);

    return (written == 0) ? -1 : 0;
}

A
antirez 已提交
3145 3146 3147 3148 3149 3150
/* Write the time 't' to 'fp' as a 32 bit signed integer, in the byte
 * order of the host. Returns 0 on success, -1 on write error. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t t32 = (int32_t) t;
    size_t written = fwrite(&t32,4,1,fp);

    return (written == 0) ? -1 : 0;
}

3151
/* check rdbLoadLen() comments for more info */
3152 3153 3154 3155 3156
static int rdbSaveLen(FILE *fp, uint32_t len) {
    unsigned char buf[2];

    if (len < (1<<6)) {
        /* Save a 6 bit len */
3157
        buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
3158 3159 3160
        if (fwrite(buf,1,1,fp) == 0) return -1;
    } else if (len < (1<<14)) {
        /* Save a 14 bit len */
3161
        buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
3162
        buf[1] = len&0xFF;
3163
        if (fwrite(buf,2,1,fp) == 0) return -1;
3164 3165
    } else {
        /* Save a 32 bit len */
3166
        buf[0] = (REDIS_RDB_32BITLEN<<6);
3167 3168 3169 3170 3171 3172 3173
        if (fwrite(buf,1,1,fp) == 0) return -1;
        len = htonl(len);
        if (fwrite(&len,4,1,fp) == 0) return -1;
    }
    return 0;
}

3174 3175 3176
/* String objects in the form "2391" "-100" without any space and with a
 * range of values that can fit in an 8, 16 or 32 bit signed value can be
 * encoded as integers to save space */
A
antirez 已提交
3177
static int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) {
3178 3179 3180 3181 3182 3183 3184 3185 3186 3187
    long long value;
    char *endptr, buf[32];

    /* Check if it's possible to encode this value as a number */
    value = strtoll(s, &endptr, 10);
    if (endptr[0] != '\0') return 0;
    snprintf(buf,32,"%lld",value);

    /* If the number converted back into a string is not identical
     * then it's not possible to encode the string as integer */
A
antirez 已提交
3188
    if (strlen(buf) != len || memcmp(buf,s,len)) return 0;
3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211

    /* Finally check if it fits in our ranges */
    if (value >= -(1<<7) && value <= (1<<7)-1) {
        enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
        enc[1] = value&0xFF;
        return 2;
    } else if (value >= -(1<<15) && value <= (1<<15)-1) {
        enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
        enc[1] = value&0xFF;
        enc[2] = (value>>8)&0xFF;
        return 3;
    } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
        enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
        enc[1] = value&0xFF;
        enc[2] = (value>>8)&0xFF;
        enc[3] = (value>>16)&0xFF;
        enc[4] = (value>>24)&0xFF;
        return 5;
    } else {
        return 0;
    }
}

A
antirez 已提交
3212 3213
/* Try to save the 'len' bytes at 's' in LZF compressed form.
 *
 * On disk the compressed string is: the encoding byte, the compressed
 * length, the original length, and the compressed payload.
 *
 * Returns the compressed length (> 0) if the string was written, 0 if
 * nothing was written because the data can't be compressed enough, and
 * -1 on write error. */
static int rdbSaveLzfStringObject(FILE *fp, unsigned char *s, size_t len) {
    size_t comprlen, outlen;
    unsigned char byte;
    void *out;
    int retval = -1;

    /* We require at least four bytes compression for this to be worth it */
    if (len <= 4) return 0;
    outlen = len-4;
    out = zmalloc(outlen+1);
    if (out == NULL) return 0;

    comprlen = lzf_compress(s, len, out, outlen);
    if (comprlen == 0) {
        zfree(out);
        return 0;
    }

    /* Data compressed! Let's save it on disk */
    byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
    if (fwrite(&byte,1,1,fp) != 0 &&
        rdbSaveLen(fp,comprlen) != -1 &&
        rdbSaveLen(fp,len) != -1 &&
        fwrite(out,comprlen,1,fp) != 0)
    {
        retval = comprlen;
    }
    zfree(out);
    return retval;
}

3240 3241
/* Save a string objet as [len][data] on disk. If the object is a string
 * representation of an integer value we try to safe it in a special form */
A
antirez 已提交
3242
static int rdbSaveRawString(FILE *fp, unsigned char *s, size_t len) {
3243
    int enclen;
3244

A
antirez 已提交
3245
    /* Try integer encoding */
3246 3247
    if (len <= 11) {
        unsigned char buf[5];
A
antirez 已提交
3248
        if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) {
3249 3250 3251 3252
            if (fwrite(buf,enclen,1,fp) == 0) return -1;
            return 0;
        }
    }
A
antirez 已提交
3253 3254

    /* Try LZF compression - under 20 bytes it's unable to compress even
3255
     * aaaaaaaaaaaaaaaaaa so skip it */
3256
    if (server.rdbcompression && len > 20) {
A
antirez 已提交
3257 3258
        int retval;

A
antirez 已提交
3259
        retval = rdbSaveLzfStringObject(fp,s,len);
A
antirez 已提交
3260 3261 3262 3263 3264 3265
        if (retval == -1) return -1;
        if (retval > 0) return 0;
        /* retval == 0 means data can't be compressed, save the old way */
    }

    /* Store verbatim */
3266
    if (rdbSaveLen(fp,len) == -1) return -1;
A
antirez 已提交
3267
    if (len && fwrite(s,len,1,fp) == 0) return -1;
3268 3269 3270
    return 0;
}

3271 3272 3273 3274
/* Like rdbSaveRawString() but takes a string object and handles encoded
 * objects. Returns 0 on success, -1 on write error. */
static int rdbSaveStringObject(FILE *fp, robj *obj) {
    robj *decoded;
    int retval;

    /* Avoid incr/decr ref count business when possible.
     * This plays well with copy-on-write given that we are probably
     * in a child process (BGSAVE). Also this makes sure key objects
     * of swapped objects are not incRefCount-ed (an assert does not allow
     * this in order to avoid bugs) */
    if (obj->encoding == REDIS_ENCODING_RAW)
        return rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));

    decoded = getDecodedObject(obj);
    retval = rdbSaveRawString(fp,decoded->ptr,sdslen(decoded->ptr));
    decrRefCount(decoded);
    return retval;
}

3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308
/* Save a double value. Doubles are saved as strings prefixed by an unsigned
 * 8 bit integer specifying the length of the representation.
 * This 8 bit integer has special values in order to specify the following
 * conditions:
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * Returns 0 on success, -1 on write error. */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char buf[128];
    int len = 1;

    if (isnan(val)) {
        buf[0] = 253;
    } else if (!isfinite(val)) {
        buf[0] = (val < 0) ? 255 : 254;
    } else {
        /* The textual representation starts right after the length byte. */
        char *repr = (char*)buf+1;

        snprintf(repr,sizeof(buf)-1,"%.17g",val);
        buf[0] = strlen(repr);
        len = buf[0]+1;
    }
    return (fwrite(buf,len,1,fp) == 0) ? -1 : 0;
}

3317 3318 3319 3320 3321 3322 3323 3324
/* Save a Redis object.
 *
 * The value 'o' is written to 'fp' in a format that depends on its type
 * (and, for hashes, on its encoding). Aggregate types are saved as the
 * number of elements followed by the elements themselves.
 *
 * Returns 0 on success, -1 on write error. On error errno is left as set
 * by the failed write, and any dict iterator created here is released. */
static int rdbSaveObject(FILE *fp, robj *o) {
    dictIterator *di = NULL;
    dictEntry *de;

    if (o->type == REDIS_STRING) {
        /* Save a string value */
        if (rdbSaveStringObject(fp,o) == -1) return -1;
    } else if (o->type == REDIS_LIST) {
        /* Save a list value */
        list *l = o->ptr;
        listIter li;
        listNode *ln;

        if (rdbSaveLen(fp,listLength(l)) == -1) return -1;
        listRewind(l,&li);
        while((ln = listNext(&li))) {
            robj *eleobj = listNodeValue(ln);

            if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
        }
    } else if (o->type == REDIS_SET) {
        /* Save a set value */
        dict *set = o->ptr;

        di = dictGetIterator(set);
        if (rdbSaveLen(fp,dictSize(set)) == -1) goto err;
        while((de = dictNext(di)) != NULL) {
            robj *eleobj = dictGetEntryKey(de);

            if (rdbSaveStringObject(fp,eleobj) == -1) goto err;
        }
        dictReleaseIterator(di);
    } else if (o->type == REDIS_ZSET) {
        /* Save a sorted set value: every element is followed by its score */
        zset *zs = o->ptr;

        di = dictGetIterator(zs->dict);
        if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) goto err;
        while((de = dictNext(di)) != NULL) {
            robj *eleobj = dictGetEntryKey(de);
            double *score = dictGetEntryVal(de);

            if (rdbSaveStringObject(fp,eleobj) == -1) goto err;
            if (rdbSaveDoubleValue(fp,*score) == -1) goto err;
        }
        dictReleaseIterator(di);
    } else if (o->type == REDIS_HASH) {
        /* Save a hash value */
        if (o->encoding == REDIS_ENCODING_ZIPMAP) {
            unsigned char *p = zipmapRewind(o->ptr);
            unsigned int count = zipmapLen(o->ptr);
            unsigned char *key, *val;
            unsigned int klen, vlen;

            if (rdbSaveLen(fp,count) == -1) return -1;
            while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
                if (rdbSaveRawString(fp,key,klen) == -1) return -1;
                if (rdbSaveRawString(fp,val,vlen) == -1) return -1;
            }
        } else {
            di = dictGetIterator(o->ptr);
            if (rdbSaveLen(fp,dictSize((dict*)o->ptr)) == -1) goto err;
            while((de = dictNext(di)) != NULL) {
                robj *key = dictGetEntryKey(de);
                robj *val = dictGetEntryVal(de);

                if (rdbSaveStringObject(fp,key) == -1) goto err;
                if (rdbSaveStringObject(fp,val) == -1) goto err;
            }
            dictReleaseIterator(di);
        }
    } else {
        redisAssert(0);
    }
    return 0;

err:
    /* Release the iterator without losing the errno of the failed write,
     * since callers may want to report it. */
    if (di) {
        int saved_errno = errno;

        dictReleaseIterator(di);
        errno = saved_errno;
    }
    return -1;
}

/* Return the length the object will have on disk if saved with
 * the rdbSaveObject() function. Currently we use a trick to get
 * this length with very little changes to the code. In the future
 * we could switch to a faster solution. */
3400 3401
static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
    if (fp == NULL) fp = server.devnull;
3402 3403 3404 3405 3406
    rewind(fp);
    assert(rdbSaveObject(fp,o) != 1);
    return ftello(fp);
}

A
antirez 已提交
3407
/* Return the number of pages required to save this object in the swap file */
3408 3409
static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
    off_t bytes = rdbSavedObjectLen(o,fp);
A
Alex McHale 已提交
3410

A
antirez 已提交
3411 3412 3413
    return (bytes+(server.vm_page_size-1))/server.vm_page_size;
}

A
antirez 已提交
3414
/* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
3415
static int rdbSave(char *filename) {
A
antirez 已提交
3416 3417 3418 3419 3420
    dictIterator *di = NULL;
    dictEntry *de;
    FILE *fp;
    char tmpfile[256];
    int j;
A
antirez 已提交
3421
    time_t now = time(NULL);
A
antirez 已提交
3422

A
antirez 已提交
3423 3424 3425 3426 3427 3428
    /* Wait for I/O therads to terminate, just in case this is a
     * foreground-saving, to avoid seeking the swap file descriptor at the
     * same time. */
    if (server.vm_enabled)
        waitEmptyIOJobsQueue();

3429
    snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
A
antirez 已提交
3430 3431 3432 3433 3434
    fp = fopen(tmpfile,"w");
    if (!fp) {
        redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
        return REDIS_ERR;
    }
3435
    if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
A
antirez 已提交
3436
    for (j = 0; j < server.dbnum; j++) {
A
antirez 已提交
3437 3438
        redisDb *db = server.db+j;
        dict *d = db->dict;
A
antirez 已提交
3439
        if (dictSize(d) == 0) continue;
A
antirez 已提交
3440 3441 3442 3443 3444 3445 3446
        di = dictGetIterator(d);
        if (!di) {
            fclose(fp);
            return REDIS_ERR;
        }

        /* Write the SELECT DB opcode */
3447 3448
        if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
        if (rdbSaveLen(fp,j) == -1) goto werr;
A
antirez 已提交
3449 3450 3451 3452 3453

        /* Iterate this DB writing every entry */
        while((de = dictNext(di)) != NULL) {
            robj *key = dictGetEntryKey(de);
            robj *o = dictGetEntryVal(de);
A
antirez 已提交
3454 3455 3456 3457 3458 3459 3460 3461 3462
            time_t expiretime = getExpire(db,key);

            /* Save the expire time */
            if (expiretime != -1) {
                /* If this key is already expired skip it */
                if (expiretime < now) continue;
                if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
                if (rdbSaveTime(fp,expiretime) == -1) goto werr;
            }
A
antirez 已提交
3463 3464
            /* Save the key and associated value. This requires special
             * handling if the value is swapped out. */
3465 3466
            if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
                                      key->storage == REDIS_VM_SWAPPING) {
A
antirez 已提交
3467 3468 3469 3470 3471
                /* Save type, key, value */
                if (rdbSaveType(fp,o->type) == -1) goto werr;
                if (rdbSaveStringObject(fp,key) == -1) goto werr;
                if (rdbSaveObject(fp,o) == -1) goto werr;
            } else {
3472
                /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
3473
                robj *po;
A
antirez 已提交
3474 3475 3476 3477
                /* Get a preview of the object in memory */
                po = vmPreviewObject(key);
                /* Save type, key, value */
                if (rdbSaveType(fp,key->vtype) == -1) goto werr;
3478
                if (rdbSaveStringObject(fp,key) == -1) goto werr;
A
antirez 已提交
3479 3480 3481 3482
                if (rdbSaveObject(fp,po) == -1) goto werr;
                /* Remove the loaded object from memory */
                decrRefCount(po);
            }
A
antirez 已提交
3483 3484 3485 3486
        }
        dictReleaseIterator(di);
    }
    /* EOF opcode */
3487 3488 3489
    if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;

    /* Make sure data will not remain on the OS's output buffers */
A
antirez 已提交
3490 3491 3492
    fflush(fp);
    fsync(fileno(fp));
    fclose(fp);
A
Alex McHale 已提交
3493

A
antirez 已提交
3494 3495 3496
    /* Use RENAME to make sure the DB file is changed atomically only
     * if the generate DB file is ok. */
    if (rename(tmpfile,filename) == -1) {
3497
        redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
A
antirez 已提交
3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513
        unlink(tmpfile);
        return REDIS_ERR;
    }
    redisLog(REDIS_NOTICE,"DB saved on disk");
    server.dirty = 0;
    server.lastsave = time(NULL);
    return REDIS_OK;

werr:
    fclose(fp);
    unlink(tmpfile);
    redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
    if (di) dictReleaseIterator(di);
    return REDIS_ERR;
}

3514
static int rdbSaveBackground(char *filename) {
A
antirez 已提交
3515 3516
    pid_t childpid;

3517
    if (server.bgsavechildpid != -1) return REDIS_ERR;
3518
    if (server.vm_enabled) waitEmptyIOJobsQueue();
A
antirez 已提交
3519 3520
    if ((childpid = fork()) == 0) {
        /* Child */
3521
        if (server.vm_enabled) vmReopenSwapFile();
A
antirez 已提交
3522
        close(server.fd);
3523
        if (rdbSave(filename) == REDIS_OK) {
3524
            _exit(0);
A
antirez 已提交
3525
        } else {
3526
            _exit(1);
A
antirez 已提交
3527 3528 3529
        }
    } else {
        /* Parent */
3530 3531 3532 3533 3534
        if (childpid == -1) {
            redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
                strerror(errno));
            return REDIS_ERR;
        }
A
antirez 已提交
3535
        redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
3536
        server.bgsavechildpid = childpid;
3537
        updateDictResizePolicy();
A
antirez 已提交
3538 3539 3540 3541 3542
        return REDIS_OK;
    }
    return REDIS_OK; /* unreached */
}

3543 3544 3545 3546 3547 3548 3549
/* Remove the temp file "temp-<childpid>.rdb" from the current working
 * directory. Errors from unlink() are ignored. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb", (int) childpid);
    unlink(path);
}

3550 3551
/* Read a one byte type from 'fp'. Returns the byte as a value in the
 * 0-255 range, or -1 if nothing could be read. */
static int rdbLoadType(FILE *fp) {
    unsigned char type;
    size_t nread = fread(&type,1,1,fp);

    if (nread == 0) return -1;
    return type;
}

A
antirez 已提交
3556 3557 3558 3559 3560 3561
/* Read a time saved by rdbSaveTime(): a 32 bit signed integer in the byte
 * order of the host. Returns -1 if the value can't be read. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t t32;
    size_t nread = fread(&t32,4,1,fp);

    if (nread == 0) return -1;
    return (time_t) t32;
}

3562 3563 3564 3565 3566
/* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
 * of this file for a description of how this are stored on disk.
 *
 * isencoded is set to 1 if the readed length is not actually a length but
 * an "encoding type", check the above comments for more info */
3567
static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
3568 3569
    unsigned char buf[2];
    uint32_t len;
3570
    int type;
3571

3572
    if (isencoded) *isencoded = 0;
3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587
    if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
    type = (buf[0]&0xC0)>>6;
    if (type == REDIS_RDB_6BITLEN) {
        /* Read a 6 bit len */
        return buf[0]&0x3F;
    } else if (type == REDIS_RDB_ENCVAL) {
        /* Read a 6 bit len encoding type */
        if (isencoded) *isencoded = 1;
        return buf[0]&0x3F;
    } else if (type == REDIS_RDB_14BITLEN) {
        /* Read a 14 bit len */
        if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
        return ((buf[0]&0x3F)<<8)|buf[1];
    } else {
        /* Read a 32 bit len */
3588 3589 3590 3591 3592
        if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
        return ntohl(len);
    }
}

3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611
/* Load an integer stored with one of the REDIS_RDB_ENC_INT* encodings and
 * return it as a REDIS_STRING object holding its decimal representation.
 *
 * enctype selects how many bytes are read: one (INT8), two (INT16) or four
 * (INT32). Multi byte values are assembled with the first byte as the least
 * significant one, and the result is interpreted as a signed integer.
 *
 * NULL is returned if the bytes can't be read. Any other enctype triggers
 * redisAssert(0). */
static robj *rdbLoadIntegerObject(FILE *fp, int enctype) {
    unsigned char enc[4];
    long long val;

    if (enctype == REDIS_RDB_ENC_INT8) {
        if (fread(enc,1,1,fp) == 0) return NULL;
        val = (signed char)enc[0];
    } else if (enctype == REDIS_RDB_ENC_INT16) {
        uint16_t v;
        if (fread(enc,2,1,fp) == 0) return NULL;
        v = enc[0]|(enc[1]<<8);
        val = (int16_t)v;
    } else if (enctype == REDIS_RDB_ENC_INT32) {
        uint32_t v;
        if (fread(enc,4,1,fp) == 0) return NULL;
        /* Widen every byte to uint32_t before shifting: an unsigned char
         * is promoted to int, and shifting a value >= 128 left by 24 bits
         * would overflow a 32 bit int, which is undefined behaviour. */
        v = (uint32_t)enc[0]|
            ((uint32_t)enc[1]<<8)|
            ((uint32_t)enc[2]<<16)|
            ((uint32_t)enc[3]<<24);
        val = (int32_t)v;
    } else {
        val = 0; /* anti-warning */
        redisAssert(0);
    }
    return createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",val));
}

3617
static robj *rdbLoadLzfStringObject(FILE*fp) {
3618 3619 3620 3621
    unsigned int len, clen;
    unsigned char *c = NULL;
    sds val = NULL;

3622 3623
    if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
    if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3624 3625 3626 3627
    if ((c = zmalloc(clen)) == NULL) goto err;
    if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
    if (fread(c,clen,1,fp) == 0) goto err;
    if (lzf_decompress(c,clen,val,len) == 0) goto err;
3628
    zfree(c);
3629 3630 3631 3632 3633 3634 3635
    return createObject(REDIS_STRING,val);
err:
    zfree(c);
    sdsfree(val);
    return NULL;
}

3636
static robj *rdbLoadStringObject(FILE*fp) {
3637 3638
    int isencoded;
    uint32_t len;
3639 3640
    sds val;

3641
    len = rdbLoadLen(fp,&isencoded);
3642 3643 3644 3645 3646
    if (isencoded) {
        switch(len) {
        case REDIS_RDB_ENC_INT8:
        case REDIS_RDB_ENC_INT16:
        case REDIS_RDB_ENC_INT32:
3647
            return rdbLoadIntegerObject(fp,len);
3648
        case REDIS_RDB_ENC_LZF:
3649
            return rdbLoadLzfStringObject(fp);
3650
        default:
A
antirez 已提交
3651
            redisAssert(0);
3652 3653 3654
        }
    }

3655 3656 3657 3658 3659 3660
    if (len == REDIS_RDB_LENERR) return NULL;
    val = sdsnewlen(NULL,len);
    if (len && fread(val,len,1,fp) == 0) {
        sdsfree(val);
        return NULL;
    }
3661
    return createObject(REDIS_STRING,val);
3662 3663
}

3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675
/* For information about double serialization check rdbSaveDoubleValue().
 *
 * A one byte length is read first: 255, 254 and 253 are special codes for
 * negative infinity, positive infinity and NaN. Any other value is the
 * length of the textual representation of the number that follows.
 *
 * On success 0 is returned and *val is set. -1 is returned on read error or
 * if the string can't be parsed as a double. */
static int rdbLoadDoubleValue(FILE *fp, double *val) {
    /* The length byte can be as large as 252, so the buffer must hold up to
     * 252 bytes plus the null terminator. */
    char buf[256];
    unsigned char len;

    if (fread(&len,1,1,fp) == 0) return -1;
    switch(len) {
    case 255: *val = R_NegInf; return 0;
    case 254: *val = R_PosInf; return 0;
    case 253: *val = R_Nan; return 0;
    default:
        if (fread(buf,len,1,fp) == 0) return -1;
        buf[len] = '\0';
        /* Don't hand back an unset *val when the text is not a number */
        if (sscanf(buf, "%lg", val) != 1) return -1;
        return 0;
    }
}

3682 3683 3684 3685 3686
/* Load a Redis object of the specified type from the specified file.
 * On success a newly allocated object is returned, otherwise NULL. */
static robj *rdbLoadObject(int type, FILE *fp) {
    robj *o;

3687
    redisLog(REDIS_DEBUG,"LOADING OBJECT %d (at %d)\n",type,ftell(fp));
3688 3689 3690
    if (type == REDIS_STRING) {
        /* Read string value */
        if ((o = rdbLoadStringObject(fp)) == NULL) return NULL;
3691
        o = tryObjectEncoding(o);
3692 3693 3694 3695 3696 3697
    } else if (type == REDIS_LIST || type == REDIS_SET) {
        /* Read list/set value */
        uint32_t listlen;

        if ((listlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
        o = (type == REDIS_LIST) ? createListObject() : createSetObject();
3698 3699 3700 3701
        /* It's faster to expand the dict to the right size asap in order
         * to avoid rehashing */
        if (type == REDIS_SET && listlen > DICT_HT_INITIAL_SIZE)
            dictExpand(o->ptr,listlen);
3702 3703 3704 3705 3706
        /* Load every single element of the list/set */
        while(listlen--) {
            robj *ele;

            if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3707
            ele = tryObjectEncoding(ele);
3708 3709 3710 3711 3712 3713 3714 3715
            if (type == REDIS_LIST) {
                listAddNodeTail((list*)o->ptr,ele);
            } else {
                dictAdd((dict*)o->ptr,ele,NULL);
            }
        }
    } else if (type == REDIS_ZSET) {
        /* Read list/set value */
3716
        size_t zsetlen;
3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727
        zset *zs;

        if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
        o = createZsetObject();
        zs = o->ptr;
        /* Load every single element of the list/set */
        while(zsetlen--) {
            robj *ele;
            double *score = zmalloc(sizeof(double));

            if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3728
            ele = tryObjectEncoding(ele);
3729 3730 3731 3732 3733
            if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
            dictAdd(zs->dict,ele,score);
            zslInsert(zs->zsl,*score,ele);
            incrRefCount(ele); /* added to skiplist */
        }
3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766
    } else if (type == REDIS_HASH) {
        size_t hashlen;

        if ((hashlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
        o = createHashObject();
        /* Too many entries? Use an hash table. */
        if (hashlen > server.hash_max_zipmap_entries)
            convertToRealHash(o);
        /* Load every key/value, then set it into the zipmap or hash
         * table, as needed. */
        while(hashlen--) {
            robj *key, *val;

            if ((key = rdbLoadStringObject(fp)) == NULL) return NULL;
            if ((val = rdbLoadStringObject(fp)) == NULL) return NULL;
            /* If we are using a zipmap and there are too big values
             * the object is converted to real hash table encoding. */
            if (o->encoding != REDIS_ENCODING_HT &&
               (sdslen(key->ptr) > server.hash_max_zipmap_value ||
                sdslen(val->ptr) > server.hash_max_zipmap_value))
            {
                    convertToRealHash(o);
            }

            if (o->encoding == REDIS_ENCODING_ZIPMAP) {
                unsigned char *zm = o->ptr;

                zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
                                  val->ptr,sdslen(val->ptr),NULL);
                o->ptr = zm;
                decrRefCount(key);
                decrRefCount(val);
            } else {
3767 3768
                key = tryObjectEncoding(key);
                val = tryObjectEncoding(val);
3769 3770 3771
                dictAdd((dict*)o->ptr,key,val);
            }
        }
3772
    } else {
A
antirez 已提交
3773
        redisAssert(0);
3774 3775 3776 3777
    }
    return o;
}

3778
static int rdbLoad(char *filename) {
A
antirez 已提交
3779
    FILE *fp;
3780 3781
    robj *keyobj = NULL;
    uint32_t dbid;
A
antirez 已提交
3782
    int type, retval, rdbver;
A
antirez 已提交
3783
    dict *d = server.db[0].dict;
A
antirez 已提交
3784
    redisDb *db = server.db+0;
3785
    char buf[1024];
A
antirez 已提交
3786
    time_t expiretime = -1, now = time(NULL);
3787
    long long loadedkeys = 0;
A
antirez 已提交
3788

A
antirez 已提交
3789 3790 3791
    fp = fopen(filename,"r");
    if (!fp) return REDIS_ERR;
    if (fread(buf,9,1,fp) == 0) goto eoferr;
3792 3793
    buf[9] = '\0';
    if (memcmp(buf,"REDIS",5) != 0) {
A
antirez 已提交
3794 3795 3796 3797
        fclose(fp);
        redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
        return REDIS_ERR;
    }
3798
    rdbver = atoi(buf+5);
3799
    if (rdbver != 1) {
3800 3801 3802 3803
        fclose(fp);
        redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
        return REDIS_ERR;
    }
A
antirez 已提交
3804 3805 3806 3807
    while(1) {
        robj *o;

        /* Read type. */
3808
        if ((type = rdbLoadType(fp)) == -1) goto eoferr;
A
antirez 已提交
3809 3810 3811 3812 3813
        if (type == REDIS_EXPIRETIME) {
            if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
            /* We read the time so we need to read the object type again */
            if ((type = rdbLoadType(fp)) == -1) goto eoferr;
        }
A
antirez 已提交
3814 3815 3816
        if (type == REDIS_EOF) break;
        /* Handle SELECT DB opcode as a special case */
        if (type == REDIS_SELECTDB) {
3817
            if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
3818
                goto eoferr;
A
antirez 已提交
3819
            if (dbid >= (unsigned)server.dbnum) {
3820
                redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
A
antirez 已提交
3821 3822
                exit(1);
            }
A
antirez 已提交
3823 3824
            db = server.db+dbid;
            d = db->dict;
A
antirez 已提交
3825 3826 3827
            continue;
        }
        /* Read key */
3828 3829 3830
        if ((keyobj = rdbLoadStringObject(fp)) == NULL) goto eoferr;
        /* Read value */
        if ((o = rdbLoadObject(type,fp)) == NULL) goto eoferr;
A
antirez 已提交
3831
        /* Add the new object in the hash table */
3832
        retval = dictAdd(d,keyobj,o);
A
antirez 已提交
3833
        if (retval == DICT_ERR) {
3834
            redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", keyobj->ptr);
A
antirez 已提交
3835 3836
            exit(1);
        }
A
antirez 已提交
3837 3838 3839 3840 3841 3842 3843
        /* Set the expire time if needed */
        if (expiretime != -1) {
            setExpire(db,keyobj,expiretime);
            /* Delete this key if already expired */
            if (expiretime < now) deleteKey(db,keyobj);
            expiretime = -1;
        }
3844
        keyobj = o = NULL;
3845 3846 3847 3848
        /* Handle swapping while loading big datasets when VM is on */
        loadedkeys++;
        if (server.vm_enabled && (loadedkeys % 5000) == 0) {
            while (zmalloc_used_memory() > server.vm_max_memory) {
3849
                if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
3850 3851
            }
        }
A
antirez 已提交
3852 3853 3854 3855 3856
    }
    fclose(fp);
    return REDIS_OK;

eoferr: /* unexpected end of file is handled here with a fatal exit */
3857
    if (keyobj) decrRefCount(keyobj);
3858
    redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
A
antirez 已提交
3859 3860 3861 3862 3863 3864
    exit(1);
    return REDIS_ERR; /* Just to avoid warning */
}

/*================================== Commands =============================== */

B
Brian Hammond 已提交
3865
/* AUTH: the client is flagged as authenticated if no password is configured
 * (server.requirepass is NULL) or if the first argument is equal to the
 * configured password. Otherwise the flag is cleared and an error is sent. */
static void authCommand(redisClient *c) {
    int granted = !server.requirepass ||
                  strcmp(c->argv[1]->ptr,server.requirepass) == 0;

    c->authenticated = granted;
    if (granted) {
        addReply(c,shared.ok);
    } else {
        addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
    }
}

A
antirez 已提交
3875 3876 3877 3878 3879
/* PING: always reply with the shared "pong" object. */
static void pingCommand(redisClient *c) {
    addReply(c,shared.pong);
}

/* ECHO: send the first argument back to the client as a bulk reply. */
static void echoCommand(redisClient *c) {
    robj *msg = c->argv[1];

    addReplyBulk(c,msg);
}

/*=================================== Strings =============================== */

/* Shared implementation of SET (nx == 0) and SETNX (nx != 0).
 *
 * argv[1] is the key and argv[2] the new value. With nx set, an existing
 * key is left untouched and shared.czero is replied; otherwise the value is
 * added or replaced, the expire of the key is removed and the reply is
 * shared.cone (nx) or shared.ok. */
static void setGenericCommand(redisClient *c, int nx) {
    robj *key = c->argv[1], *val = c->argv[2];

    if (nx) deleteIfVolatile(c->db,key);
    if (dictAdd(c->db->dict,key,val) != DICT_ERR) {
        /* New entry: the dict now references both the objects */
        incrRefCount(key);
        incrRefCount(val);
    } else if (nx) {
        addReply(c,shared.czero);
        return;
    } else {
        /* If the key is about a swapped value, we want a new key object
         * to overwrite the old. So we delete the old key in the database.
         * This will also make sure that swap pages about the old object
         * will be marked as free. */
        if (server.vm_enabled && deleteIfSwapped(c->db,key))
            incrRefCount(key);
        dictReplace(c->db->dict,key,val);
        incrRefCount(val);
    }
    server.dirty++;
    removeExpire(c->db,key);
    addReply(c, nx ? shared.cone : shared.ok);
}

/* SET: setGenericCommand() without the "only if not existing" flag. */
static void setCommand(redisClient *c) {
    setGenericCommand(c,0);
}

/* SETNX: setGenericCommand() with the "only if not existing" flag. */
static void setnxCommand(redisClient *c) {
    setGenericCommand(c,1);
}

3921
/* Reply with the string value stored at argv[1].
 *
 * A missing key gets shared.nullbulk (sent by lookupKeyReadOrReply) and
 * REDIS_OK is returned. A key holding a non string value gets the wrong
 * type error and REDIS_ERR is returned. Otherwise the value is sent as a
 * bulk reply and REDIS_OK is returned. */
static int getGenericCommand(redisClient *c) {
    robj *o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk);

    if (o == NULL) return REDIS_OK;
    if (o->type == REDIS_STRING) {
        addReplyBulk(c,o);
        return REDIS_OK;
    }
    addReply(c,shared.wrongtypeerr);
    return REDIS_ERR;
}

3936 3937 3938 3939
/* GET: getGenericCommand() ignoring its return value. */
static void getCommand(redisClient *c) {
    (void) getGenericCommand(c);
}

A
antirez 已提交
3940
/* GETSET: reply with the old value via getGenericCommand(), then store
 * argv[2] as the new value of argv[1] and remove the expire of the key.
 * Nothing is modified if getGenericCommand() reports REDIS_ERR. */
static void getsetCommand(redisClient *c) {
    robj *key, *newval;

    if (getGenericCommand(c) == REDIS_ERR) return;

    key = c->argv[1];
    newval = c->argv[2];
    if (dictAdd(c->db->dict,key,newval) != DICT_ERR) {
        /* New entry: the key object is now referenced by the dict too */
        incrRefCount(key);
    } else {
        dictReplace(c->db->dict,key,newval);
    }
    incrRefCount(newval);
    server.dirty++;
    removeExpire(c->db,key);
}

3952 3953
/* MGET: multi bulk reply with one element per requested key. Keys that
 * don't exist or that don't hold a string are reported as shared.nullbulk. */
static void mgetCommand(redisClient *c) {
    int j;

    addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
    for (j = 1; j < c->argc; j++) {
        robj *o = lookupKeyRead(c->db,c->argv[j]);

        if (o != NULL && o->type == REDIS_STRING) {
            addReplyBulk(c,o);
        } else {
            addReply(c,shared.nullbulk);
        }
    }
}

A
antirez 已提交
3970
/* Shared implementation of MSET (nx == 0) and MSETNX (nx != 0).
 *
 * Arguments are key/value pairs starting at argv[1]; an even argc means a
 * value is missing and an error is replied. With nx set nothing is written
 * and shared.czero is replied if at least one of the keys already exists.
 * Otherwise every pair is added or replaced and the expire of every key is
 * removed. The final reply is shared.cone (nx) or shared.ok. */
static void msetGenericCommand(redisClient *c, int nx) {
    int j, busykeys = 0;

    if ((c->argc % 2) == 0) {
        addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
        return;
    }

    /* Handle the NX flag. The MSETNX semantic is to return zero and to set
     * nothing at all if at least one key already exists. Every key is
     * looked up, even after the first existing one is found. */
    if (nx) {
        for (j = 1; j < c->argc; j += 2)
            if (lookupKeyWrite(c->db,c->argv[j]) != NULL) busykeys++;
        if (busykeys) {
            addReply(c, shared.czero);
            return;
        }
    }

    for (j = 1; j < c->argc; j += 2) {
        robj *key = c->argv[j], *val;

        val = c->argv[j+1] = tryObjectEncoding(c->argv[j+1]);
        if (dictAdd(c->db->dict,key,val) != DICT_ERR) {
            /* New entry: the key object is now referenced by the dict */
            incrRefCount(key);
        } else {
            dictReplace(c->db->dict,key,val);
        }
        incrRefCount(val);
        removeExpire(c->db,key);
    }
    server.dirty += (c->argc-1)/2;
    addReply(c, nx ? shared.cone : shared.ok);
}

/* MSET: msetGenericCommand() without the "only if none exists" flag. */
static void msetCommand(redisClient *c) {
    msetGenericCommand(c,0);
}

/* MSETNX: msetGenericCommand() with the "only if none exists" flag. */
static void msetnxCommand(redisClient *c) {
    msetGenericCommand(c,1);
}

4017
/* Shared implementation of INCR, DECR, INCRBY and DECRBY: add 'incr' to the
 * integer stored at argv[1] and reply with the new value.
 *
 * The current value is obtained with getLongLongFromObject(); nothing is
 * done if it does not return REDIS_OK. The result is stored as a new string
 * object, and when an existing key is replaced its expire is removed. */
static void incrDecrCommand(redisClient *c, long long incr) {
    robj *key = c->argv[1];
    robj *o = lookupKeyWrite(c->db,key);
    long long value;

    if (getLongLongFromObject(c, o, &value) != REDIS_OK) return;

    value += incr;
    o = tryObjectEncoding(
        createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",value)));
    if (dictAdd(c->db->dict,key,o) != DICT_ERR) {
        /* New entry: the key object is now referenced by the dict */
        incrRefCount(key);
    } else {
        dictReplace(c->db->dict,key,o);
        removeExpire(c->db,key);
    }
    server.dirty++;
    addReply(c,shared.colon);
    addReply(c,o);
    addReply(c,shared.crlf);
}

/* INCR: incrDecrCommand() with an increment of 1. */
static void incrCommand(redisClient *c) {
    incrDecrCommand(c,1);
}

/* DECR: incrDecrCommand() with an increment of -1. */
static void decrCommand(redisClient *c) {
    incrDecrCommand(c,-1);
}

/* INCRBY: parse argv[2] with getLongLongFromObject() and add it to the
 * value at argv[1]. Nothing is done if the parsing does not return
 * REDIS_OK. */
static void incrbyCommand(redisClient *c) {
    long long delta;

    if (getLongLongFromObject(c, c->argv[2], &delta) == REDIS_OK)
        incrDecrCommand(c,delta);
}

/* DECRBY: parse argv[2] with getLongLongFromObject() and subtract it from
 * the value at argv[1]. Nothing is done if the parsing does not return
 * REDIS_OK. */
static void decrbyCommand(redisClient *c) {
    long long delta;

    if (getLongLongFromObject(c, c->argv[2], &delta) == REDIS_OK)
        incrDecrCommand(c,-delta);
}

A
antirez 已提交
4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079
/* APPEND: append argv[2] to the string stored at argv[1] and reply with
 * the length of the resulting string.
 *
 * If the key does not exist it is created with argv[2] as value. If it
 * holds a non string value the wrong type error is replied and nothing is
 * changed. */
static void appendCommand(redisClient *c) {
    size_t totlen;
    robj *o;

    o = lookupKeyWrite(c->db,c->argv[1]);
    if (o == NULL) {
        /* Create the key */
        dictAdd(c->db->dict,c->argv[1],c->argv[2]);
        incrRefCount(c->argv[1]);
        incrRefCount(c->argv[2]);
        totlen = stringObjectLen(c->argv[2]);
    } else {
        dictEntry *de;

        de = dictFind(c->db->dict,c->argv[1]);
        assert(de != NULL);

        o = dictGetEntryVal(de);
        if (o->type != REDIS_STRING) {
            addReply(c,shared.wrongtypeerr);
            return;
        }
        /* If the object is specially encoded or shared we have to make
         * a copy */
        if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
            robj *decoded = getDecodedObject(o);

            o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
            decrRefCount(decoded);
            dictReplace(c->db->dict,c->argv[1],o);
        }
        /* APPEND! */
        if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
            o->ptr = sdscatlen(o->ptr,
                c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
        } else {
            /* The argument is not a raw string: its ptr field is printed
             * as an integer. The cast must be to (signed) long in order to
             * match the %ld conversion. */
            o->ptr = sdscatprintf(o->ptr, "%ld",
                (long) c->argv[2]->ptr);
        }
        totlen = sdslen(o->ptr);
    }
    server.dirty++;
    addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
}

A
antirez 已提交
4112 4113 4114 4115
/* SUBSTR: reply with the part of the string at argv[1] between the indexes
 * argv[2] and argv[3], both inclusive.
 *
 * Negative indexes are offsets from the end of the string and are clamped
 * to 0 when they still fall before the start. shared.nullbulk is replied
 * if the key does not exist, if start > end, or if start is past the end
 * of the string. An end index past the end is clamped to the last byte. */
static void substrCommand(redisClient *c) {
    robj *o;
    long start = atoi(c->argv[2]->ptr);
    long end = atoi(c->argv[3]->ptr);
    size_t rangelen, slen;
    sds range;

    o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk);
    if (o == NULL || checkType(c,o,REDIS_STRING)) return;

    /* Work on the decoded object: it is released before returning */
    o = getDecodedObject(o);
    slen = sdslen(o->ptr);

    /* convert negative indexes */
    if (start < 0) start = slen+start;
    if (end < 0) end = slen+end;
    if (start < 0) start = 0;
    if (end < 0) end = 0;

    /* indexes sanity checks */
    if (start > end || (size_t)start >= slen) {
        /* Out of range start or start > end result in null reply */
        addReply(c,shared.nullbulk);
        decrRefCount(o);
        return;
    }
    if ((size_t)end >= slen) end = slen-1;
    rangelen = (end-start)+1;

    /* Return the result */
    addReplySds(c,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen));
    range = sdsnewlen((char*)o->ptr+start,rangelen);
    addReplySds(c,range);
    addReply(c,shared.crlf);
    decrRefCount(o);
}

A
antirez 已提交
4149 4150 4151
/* ========================= Type agnostic commands ========================= */

/* DEL: try to delete every key passed as argument and reply with the number
 * of keys for which deleteKey() reported success. server.dirty is
 * incremented once per deleted key. */
static void delCommand(redisClient *c) {
    int j, deleted = 0;

    for (j = 1; j < c->argc; j++) {
        if (!deleteKey(c->db,c->argv[j])) continue;
        server.dirty++;
        deleted++;
    }
    addReplyLong(c,deleted);
}

/* EXISTS: reply shared.cone if lookupKeyRead() finds argv[1], shared.czero
 * otherwise. */
static void existsCommand(redisClient *c) {
    robj *found = lookupKeyRead(c->db,c->argv[1]);

    addReply(c,found ? shared.cone : shared.czero);
}

/* SELECT: switch the client to the DB whose index is argv[1] (parsed with
 * atoi). An error is replied if selectDb() rejects the index. */
static void selectCommand(redisClient *c) {
    int id = atoi(c->argv[1]->ptr);

    if (selectDb(c,id) != REDIS_ERR) {
        addReply(c,shared.ok);
        return;
    }
    addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
}

/* RANDOMKEY: reply with a random key of the selected DB as a "+" line.
 *
 * Random entries are drawn until one is found for which expireIfNeeded()
 * returns 0, or until dictGetRandomKey() returns NULL. In the latter case
 * the reply is an empty "+" line. */
static void randomkeyCommand(redisClient *c) {
    dictEntry *de;

    do {
        de = dictGetRandomKey(c->db->dict);
    } while (de != NULL && expireIfNeeded(c->db,dictGetEntryKey(de)) != 0);

    addReply(c,shared.plus);
    if (de != NULL) addReply(c,dictGetEntryKey(de));
    addReply(c,shared.crlf);
}

/* KEYS: reply with every key of the selected DB matching the glob style
 * pattern in argv[1]. Matching keys for which expireIfNeeded() does not
 * return 0 are skipped.
 *
 * The number of elements is not known before the scan, so an object with
 * a NULL ptr is queued as first element of the reply and its ptr is set to
 * the "*<count>\r\n" header once the scan is finished. The object is
 * written after our decrRefCount(): presumably addReply() retained its own
 * reference, this code relies on that. */
static void keysCommand(redisClient *c) {
    sds pattern = c->argv[1]->ptr;
    int plen = sdslen(pattern);
    /* The "*" pattern matches everything: no need to call the matcher */
    int matchall = (pattern[0] == '*' && pattern[1] == '\0');
    unsigned long numkeys = 0;
    robj *lenobj = createObject(REDIS_STRING,NULL);
    dictIterator *di = dictGetIterator(c->db->dict);
    dictEntry *de;

    addReply(c,lenobj);
    decrRefCount(lenobj);
    while((de = dictNext(di)) != NULL) {
        robj *keyobj = dictGetEntryKey(de);
        sds key = keyobj->ptr;

        if (!matchall && !stringmatchlen(pattern,plen,key,sdslen(key),0))
            continue;
        if (expireIfNeeded(c->db,keyobj) != 0) continue;
        addReplyBulk(c,keyobj);
        numkeys++;
    }
    dictReleaseIterator(di);
    lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
}

/* DBSIZE: integer reply with dictSize() of the selected DB dictionary. */
static void dbsizeCommand(redisClient *c) {
    sds reply = sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict));

    addReplySds(c,reply);
}

/* LASTSAVE: integer reply with the value of server.lastsave.
 *
 * The value is explicitly converted to unsigned long so that the argument
 * always matches the %lu conversion, whatever integer type the field is
 * declared with. */
static void lastsaveCommand(redisClient *c) {
    addReplySds(c,
        sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)server.lastsave));
}

/* TYPE: reply with a "+" line naming the type of the value at argv[1]:
 * none (missing key), string, list, set, zset, hash, or unknown for any
 * other type code. */
static void typeCommand(redisClient *c) {
    robj *o = lookupKeyRead(c->db,c->argv[1]);
    char *type = "+none";

    if (o != NULL) {
        switch(o->type) {
        case REDIS_STRING: type = "+string"; break;
        case REDIS_LIST: type = "+list"; break;
        case REDIS_SET: type = "+set"; break;
        case REDIS_ZSET: type = "+zset"; break;
        case REDIS_HASH: type = "+hash"; break;
        default: type = "+unknown"; break;
        }
    }
    addReplySds(c,sdsnew(type));
    addReply(c,shared.crlf);
}

/* SAVE: synchronously save the DB with rdbSave() and reply shared.ok or
 * shared.err according to the result. The command is refused with an error
 * while server.bgsavechildpid is set. */
static void saveCommand(redisClient *c) {
    if (server.bgsavechildpid != -1) {
        addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
        return;
    }
    addReply(c,
        (rdbSave(server.dbfilename) == REDIS_OK) ? shared.ok : shared.err);
}

/* BGSAVE: start a background save with rdbSaveBackground(). A status line
 * is replied if it returns REDIS_OK, shared.err otherwise. The command is
 * refused with an error while server.bgsavechildpid is set. */
static void bgsaveCommand(redisClient *c) {
    if (server.bgsavechildpid != -1) {
        addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
        return;
    }
    if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
        addReply(c,shared.err);
        return;
    }
    addReplySds(c,sdsnew("+Background saving started\r\n"));
}

/* SHUTDOWN: persist the dataset and terminate the process.
 *
 * A running background save child is killed first and its temp file is
 * removed. With the append only file enabled, the AOF is fsync()ed and the
 * process exits. Otherwise a synchronous rdbSave() is performed: on success
 * the pid file (when daemonized) is removed and the process exits, on
 * failure an error is replied and the server keeps running. In both exit
 * paths the VM swap file is removed when VM is enabled. */
static void shutdownCommand(redisClient *c) {
    redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");

    /* Kill the saving child if there is a background saving in progress.
       We want to avoid race conditions, for instance our saving child may
       overwrite the synchronous saving did by SHUTDOWN. */
    if (server.bgsavechildpid != -1) {
        redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
        kill(server.bgsavechildpid,SIGKILL);
        rdbRemoveTempFile(server.bgsavechildpid);
    }

    if (server.appendonly) {
        /* Append only file: fsync() the AOF and exit */
        fsync(server.appendfd);
        if (server.vm_enabled) unlink(server.vm_swap_file);
        exit(0);
    }

    /* Snapshotting. Perform a SYNC SAVE and exit */
    if (rdbSave(server.dbfilename) != REDIS_OK) {
        /* Ooops.. error saving! The best we can do is to continue
         * operating. Note that if there was a background saving process,
         * in the next cron() Redis will be notified that the background
         * saving aborted, handling special stuff like slaves pending for
         * synchronization... */
        redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
        addReplySds(c,
            sdsnew("-ERR can't quit, problems saving the DB\r\n"));
        return;
    }
    if (server.daemonize)
        unlink(server.pidfile);
    redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
    redisLog(REDIS_WARNING,"Server exit now, bye bye...");
    if (server.vm_enabled) unlink(server.vm_swap_file);
    exit(0);
}

/* Shared implementation of RENAME (nx == 0) and RENAMENX (nx != 0): move
 * the value stored at argv[1] to the key argv[2].
 *
 * Errors are replied if the two names are equal or if the source key does
 * not exist. With nx set an existing destination is left untouched and
 * shared.czero is replied. On success the source key is deleted and the
 * reply is shared.cone (nx) or shared.ok. */
static void renameGenericCommand(redisClient *c, int nx) {
    robj *srckey = c->argv[1], *dstkey = c->argv[2];
    robj *o;

    /* To use the same key as src and dst is probably an error */
    if (sdscmp(srckey->ptr,dstkey->ptr) == 0) {
        addReply(c,shared.sameobjecterr);
        return;
    }

    o = lookupKeyWriteOrReply(c,srckey,shared.nokeyerr);
    if (o == NULL) return;

    /* The value is going to be referenced by the destination key too */
    incrRefCount(o);
    deleteIfVolatile(c->db,dstkey);
    if (dictAdd(c->db->dict,dstkey,o) != DICT_ERR) {
        incrRefCount(dstkey);
    } else if (nx) {
        decrRefCount(o);
        addReply(c,shared.czero);
        return;
    } else {
        dictReplace(c->db->dict,dstkey,o);
    }
    deleteKey(c->db,srckey);
    server.dirty++;
    addReply(c,nx ? shared.cone : shared.ok);
}

/* RENAME: renameGenericCommand() overwriting an existing destination. */
static void renameCommand(redisClient *c) {
    renameGenericCommand(c,0);
}

/* RENAMENX: renameGenericCommand() refusing an existing destination. */
static void renamenxCommand(redisClient *c) {
    renameGenericCommand(c,1);
}

/* MOVE: move the key argv[1] from the selected DB to the DB whose index is
 * argv[2] (parsed with atoi).
 *
 * Errors are replied if the target index is rejected by selectDb() or if
 * the target is the currently selected DB. shared.czero is replied if the
 * key does not exist in the source DB or can't be added to the target DB,
 * shared.cone once the key was moved. */
static void moveCommand(redisClient *c) {
    robj *key = c->argv[1], *o;
    redisDb *src = c->db, *dst;
    int srcid = c->db->id;

    /* Obtain the target DB pointer: the target is selected just to read
     * c->db, then the source DB is selected back. */
    if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
        addReply(c,shared.outofrangeerr);
        return;
    }
    dst = c->db;
    selectDb(c,srcid); /* Back to the source DB */

    /* If the user is moving using as target the same
     * DB as the source DB it is probably an error. */
    if (src == dst) {
        addReply(c,shared.sameobjecterr);
        return;
    }

    /* Check if the element exists and get a reference */
    o = lookupKeyWrite(c->db,key);
    if (o == NULL) {
        addReply(c,shared.czero);
        return;
    }

    /* Try to add the element to the target DB */
    deleteIfVolatile(dst,key);
    if (dictAdd(dst->dict,key,o) == DICT_ERR) {
        addReply(c,shared.czero);
        return;
    }
    incrRefCount(key);
    incrRefCount(o);

    /* OK! key moved, free the entry in the source DB */
    deleteKey(src,key);
    server.dirty++;
    addReply(c,shared.cone);
}

/* =================================== Lists ================================ */
/* Shared implementation of LPUSH and RPUSH: add argv[2] to the head
 * (where == REDIS_HEAD) or to the tail of the list stored at argv[1],
 * creating the list if the key does not exist.
 *
 * The wrong type error is replied if the key holds a non list value. If
 * handleClientsWaitingListPush() returns non zero the element is not
 * stored and shared.cone is replied. Otherwise the reply is the new length
 * of the list. */
static void pushGenericCommand(redisClient *c, int where) {
    robj *key = c->argv[1], *ele = c->argv[2];
    robj *lobj = lookupKeyWrite(c->db,key);
    list *list;

    if (lobj != NULL && lobj->type != REDIS_LIST) {
        addReply(c,shared.wrongtypeerr);
        return;
    }
    if (handleClientsWaitingListPush(c,key,ele)) {
        addReply(c,shared.cone);
        return;
    }
    if (lobj == NULL) {
        /* First element: create the list and add it to the DB */
        lobj = createListObject();
        dictAdd(c->db->dict,key,lobj);
        incrRefCount(key);
    }
    list = lobj->ptr;
    if (where == REDIS_HEAD) {
        listAddNodeHead(list,ele);
    } else {
        listAddNodeTail(list,ele);
    }
    incrRefCount(ele);
    server.dirty++;
    addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",listLength(list)));
}

/* Push argv[2] at the head of the list stored at argv[1]. */
static void lpushCommand(redisClient *c) {
    const int where = REDIS_HEAD;

    pushGenericCommand(c,where);
}

/* Push argv[2] at the tail of the list stored at argv[1]. */
static void rpushCommand(redisClient *c) {
    const int where = REDIS_TAIL;

    pushGenericCommand(c,where);
}

/* Reply with the number of elements of the list stored at argv[1].
 * shared.czero is handed to the lookup helper as the reply to use for it;
 * nothing more is sent when the lookup returns NULL or checkType() reports
 * a mismatch. */
static void llenCommand(redisClient *c) {
    robj *lobj = lookupKeyReadOrReply(c,c->argv[1],shared.czero);

    if (lobj == NULL) return;
    if (checkType(c,lobj,REDIS_LIST)) return;

    addReplyUlong(c,listLength((list*)lobj->ptr));
}

/* Reply with the element found at index argv[2] (parsed with atoi) of the
 * list stored at argv[1], or with shared.nullbulk when listIndex() finds
 * no node at that index. */
static void lindexCommand(redisClient *c) {
    robj *lobj;
    listNode *node;
    int pos = atoi(c->argv[2]->ptr);

    lobj = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk);
    if (lobj == NULL || checkType(c,lobj,REDIS_LIST)) return;

    node = listIndex((list*)lobj->ptr, pos);
    if (node != NULL) {
        robj *ele = listNodeValue(node);

        addReplyBulk(c,ele);
        return;
    }
    addReply(c,shared.nullbulk);
}

/* Replace the element at index argv[2] (parsed with atoi) of the list
 * stored at argv[1] with argv[3]. Sends shared.outofrangeerr when
 * listIndex() finds no node at that index, shared.ok on success. */
static void lsetCommand(redisClient *c) {
    robj *lobj, *old;
    listNode *node;
    int pos = atoi(c->argv[2]->ptr);

    lobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr);
    if (lobj == NULL || checkType(c,lobj,REDIS_LIST)) return;

    node = listIndex((list*)lobj->ptr, pos);
    if (node == NULL) {
        addReply(c,shared.outofrangeerr);
        return;
    }

    /* Drop the reference to the old value, then make the node point to
     * (and hold a reference on) the new one. */
    old = listNodeValue(node);
    decrRefCount(old);
    listNodeValue(node) = c->argv[3];
    incrRefCount(c->argv[3]);

    addReply(c,shared.ok);
    server.dirty++;
}

/* Shared implementation of the head/tail list pop commands.
 *
 * where - REDIS_HEAD pops the first node, anything else pops the last one.
 *
 * The popped element is sent as a bulk reply and removed from the list;
 * the key is deleted when the list becomes empty. shared.nullbulk is sent
 * when the list has no node to pop. */
static void popGenericCommand(redisClient *c, int where) {
    robj *lobj, *ele;
    list *items;
    listNode *node;

    lobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk);
    if (lobj == NULL || checkType(c,lobj,REDIS_LIST)) return;
    items = lobj->ptr;

    node = (where == REDIS_HEAD) ? listFirst(items) : listLast(items);
    if (node == NULL) {
        addReply(c,shared.nullbulk);
        return;
    }

    /* Queue the reply before unlinking the node from the list. */
    ele = listNodeValue(node);
    addReplyBulk(c,ele);
    listDelNode(items,node);
    if (listLength(items) == 0) deleteKey(c->db,c->argv[1]);
    server.dirty++;
}

/* Pop the first element of the list stored at argv[1]. */
static void lpopCommand(redisClient *c) {
    const int where = REDIS_HEAD;

    popGenericCommand(c,where);
}

/* Pop the last element of the list stored at argv[1]. */
static void rpopCommand(redisClient *c) {
    const int where = REDIS_TAIL;

    popGenericCommand(c,where);
}

/* Reply with the elements of the list stored at argv[1] whose indexes are
 * between argv[2] (start) and argv[3] (end), both inclusive and both parsed
 * with atoi. Negative indexes are relative to the list length (llen+index).
 *
 * An empty multi bulk reply is sent when start > end or start >= length.
 * A negative end that is still negative after the conversion is NOT
 * clamped to 0: it denotes a position before the first element, so the
 * range is empty (clamping it used to make "0 -100" return the head). */
static void lrangeCommand(redisClient *c) {
    robj *o;
    int start = atoi(c->argv[2]->ptr);
    int end = atoi(c->argv[3]->ptr);
    int llen;
    int rangelen, j;
    list *list;
    listNode *ln;
    robj *ele;

    if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
         || checkType(c,o,REDIS_LIST)) return;
    list = o->ptr;
    llen = listLength(list);

    /* convert negative indexes */
    if (start < 0) start = llen+start;
    if (end < 0) end = llen+end;
    if (start < 0) start = 0;

    /* Invariant: start >= 0, so this test is also true when end < 0.
     * The range is empty when start > end or start >= length. */
    if (start > end || start >= llen) {
        addReply(c,shared.emptymultibulk);
        return;
    }
    if (end >= llen) end = llen-1;
    rangelen = (end-start)+1;

    /* Return the result in form of a multi-bulk reply */
    ln = listIndex(list, start);
    addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
    for (j = 0; j < rangelen; j++) {
        ele = listNodeValue(ln);
        addReplyBulk(c,ele);
        ln = ln->next;
    }
}

/* Trim the list stored at argv[1] so that only the elements with indexes
 * between argv[2] (start) and argv[3] (end), both inclusive and both parsed
 * with atoi, are retained. Negative indexes are relative to the list length
 * (llen+index).
 *
 * When start > end or start >= length every element is removed. A negative
 * end that is still negative after the conversion is NOT clamped to 0: it
 * denotes a position before the first element, so nothing is retained
 * (clamping it used to keep the head element for ranges like "0 -100").
 * The key is deleted when the list becomes empty. */
static void ltrimCommand(redisClient *c) {
    robj *o;
    int start = atoi(c->argv[2]->ptr);
    int end = atoi(c->argv[3]->ptr);
    int llen;
    int j, ltrim, rtrim;
    list *list;
    listNode *ln;

    if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.ok)) == NULL ||
        checkType(c,o,REDIS_LIST)) return;
    list = o->ptr;
    llen = listLength(list);

    /* convert negative indexes */
    if (start < 0) start = llen+start;
    if (end < 0) end = llen+end;
    if (start < 0) start = 0;

    /* Invariant: start >= 0, so this test is also true when end < 0.
     * The range is empty when start > end or start >= length. */
    if (start > end || start >= llen) {
        /* Out of range start or start > end result in empty list */
        ltrim = llen;
        rtrim = 0;
    } else {
        if (end >= llen) end = llen-1;
        ltrim = start;
        rtrim = llen-end-1;
    }

    /* Remove list elements to perform the trim */
    for (j = 0; j < ltrim; j++) {
        ln = listFirst(list);
        listDelNode(list,ln);
    }
    for (j = 0; j < rtrim; j++) {
        ln = listLast(list);
        listDelNode(list,ln);
    }
    if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
    server.dirty++;
    addReply(c,shared.ok);
}

/* Remove from the list stored at argv[1] the elements that compare equal
 * to argv[3]. argv[2] (parsed with atoi) is the count: a positive count
 * scans from the head, a negative one scans from the tail using its
 * absolute value, and 0 removes every match. The number of removed
 * elements is sent as reply; the key is deleted when the list becomes
 * empty. */
static void lremCommand(redisClient *c) {
    robj *lobj;
    list *items;
    listNode *node, *following;
    int toremove = atoi(c->argv[2]->ptr);
    int removed = 0;
    int fromtail = 0;

    lobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero);
    if (lobj == NULL || checkType(c,lobj,REDIS_LIST)) return;
    items = lobj->ptr;

    if (toremove < 0) {
        toremove = -toremove;
        fromtail = 1;
    }
    for (node = fromtail ? items->tail : items->head;
         node != NULL;
         node = following)
    {
        robj *ele = listNodeValue(node);

        /* Fetch the neighbour first: the current node may be deleted. */
        following = fromtail ? node->prev : node->next;
        if (compareStringObjects(ele,c->argv[3]) != 0) continue;

        listDelNode(items,node);
        server.dirty++;
        removed++;
        if (toremove && removed == toremove) break;
    }
    if (listLength(items) == 0) deleteKey(c->db,c->argv[1]);
    addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
}

A
antirez 已提交
4650
/* This is the semantic of this command:
A
antirez 已提交
4651
 *  RPOPLPUSH srclist dstlist:
A
antirez 已提交
4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664
 *   IF LLEN(srclist) > 0
 *     element = RPOP srclist
 *     LPUSH dstlist element
 *     RETURN element
 *   ELSE
 *     RETURN nil
 *   END
 *  END
 *
 * The idea is to be able to get an element from a list in a reliable way
 * since the element is not just returned but pushed against another list
 * as well. This command was originally proposed by Ezra Zygmuntowicz.
 */
A
antirez 已提交
4665
static void rpoplpushcommand(redisClient *c) {
A
antirez 已提交
4666
    robj *sobj;
A
antirez 已提交
4667 4668 4669 4670 4671 4672 4673
    list *srclist;
    listNode *ln;

    if ((sobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
        checkType(c,sobj,REDIS_LIST)) return;
    srclist = sobj->ptr;
    ln = listLast(srclist);
A
antirez 已提交
4674

A
antirez 已提交
4675
    if (ln == NULL) {
A
antirez 已提交
4676 4677
        addReply(c,shared.nullbulk);
    } else {
A
antirez 已提交
4678 4679 4680
        robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
        robj *ele = listNodeValue(ln);
        list *dstlist;
4681

A
antirez 已提交
4682 4683 4684 4685
        if (dobj && dobj->type != REDIS_LIST) {
            addReply(c,shared.wrongtypeerr);
            return;
        }
A
antirez 已提交
4686

A
antirez 已提交
4687 4688 4689 4690 4691 4692 4693 4694
        /* Add the element to the target list (unless it's directly
         * passed to some BLPOP-ing client */
        if (!handleClientsWaitingListPush(c,c->argv[2],ele)) {
            if (dobj == NULL) {
                /* Create the list if the key does not exist */
                dobj = createListObject();
                dictAdd(c->db->dict,c->argv[2],dobj);
                incrRefCount(c->argv[2]);
A
antirez 已提交
4695
            }
A
antirez 已提交
4696 4697 4698
            dstlist = dobj->ptr;
            listAddNodeHead(dstlist,ele);
            incrRefCount(ele);
A
antirez 已提交
4699
        }
A
antirez 已提交
4700 4701 4702 4703 4704 4705

        /* Send the element to the client as reply as well */
        addReplyBulk(c,ele);

        /* Finally remove the element from the source list */
        listDelNode(srclist,ln);
4706
        if (listLength(srclist) == 0) deleteKey(c->db,c->argv[1]);
A
antirez 已提交
4707
        server.dirty++;
A
antirez 已提交
4708 4709 4710
    }
}

A
antirez 已提交
4711 4712 4713 4714 4715
/* ==================================== Sets ================================ */

/* Add the member argv[2] to the set stored at argv[1], creating the set
 * when the key does not exist. Sends shared.cone when the member was
 * added, shared.czero when dictAdd() refused it, and a wrong-type error
 * when the key holds a non-set value. */
static void saddCommand(redisClient *c) {
    robj *set = lookupKeyWrite(c->db,c->argv[1]);

    if (set != NULL && set->type != REDIS_SET) {
        addReply(c,shared.wrongtypeerr);
        return;
    }
    if (set == NULL) {
        set = createSetObject();
        dictAdd(c->db->dict,c->argv[1],set);
        incrRefCount(c->argv[1]);
    }

    if (dictAdd(set->ptr,c->argv[2],NULL) != DICT_OK) {
        addReply(c,shared.czero);
        return;
    }
    /* The set dictionary now references the member object. */
    incrRefCount(c->argv[2]);
    server.dirty++;
    addReply(c,shared.cone);
}

/* Remove the member argv[2] from the set stored at argv[1]. Sends
 * shared.cone when the member was removed and shared.czero when
 * dictDelete() did not succeed. The key is deleted when the set becomes
 * empty. */
static void sremCommand(redisClient *c) {
    robj *set = lookupKeyWriteOrReply(c,c->argv[1],shared.czero);

    if (set == NULL || checkType(c,set,REDIS_SET)) return;

    if (dictDelete(set->ptr,c->argv[2]) != DICT_OK) {
        addReply(c,shared.czero);
        return;
    }
    server.dirty++;
    if (htNeedsResize(set->ptr)) dictResize(set->ptr);
    if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
    addReply(c,shared.cone);
}

A
antirez 已提交
4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774
/* Move the member argv[3] from the set stored at argv[1] to the set stored
 * at argv[2], creating the destination set when its key does not exist.
 *
 * Replies: shared.czero when the source key is missing or does not contain
 * the member, a wrong-type error when either key holds a non-set value,
 * shared.cone once the member has been removed from the source. The source
 * key is deleted when it becomes empty and is not also the destination. */
static void smoveCommand(redisClient *c) {
    robj *srcset = lookupKeyWrite(c->db,c->argv[1]);
    robj *dstset = lookupKeyWrite(c->db,c->argv[2]);
    robj *member = c->argv[3];

    if (srcset == NULL) {
        addReply(c,shared.czero);
        return;
    }
    if (srcset->type != REDIS_SET) {
        addReply(c,shared.wrongtypeerr);
        return;
    }
    if (dstset != NULL && dstset->type != REDIS_SET) {
        addReply(c,shared.wrongtypeerr);
        return;
    }

    /* Remove the element from the source set */
    if (dictDelete(srcset->ptr,member) == DICT_ERR) {
        /* Member not found in the source set: reply with zero */
        addReply(c,shared.czero);
        return;
    }
    if (srcset != dstset && dictSize((dict*)srcset->ptr) == 0)
        deleteKey(c->db,c->argv[1]);
    server.dirty++;

    /* Add the element to the destination set */
    if (dstset == NULL) {
        dstset = createSetObject();
        dictAdd(c->db->dict,c->argv[2],dstset);
        incrRefCount(c->argv[2]);
    }
    if (dictAdd(dstset->ptr,member,NULL) == DICT_OK)
        incrRefCount(member);
    addReply(c,shared.cone);
}

A
antirez 已提交
4789
/* Reply with shared.cone when argv[2] is found in the set stored at
 * argv[1], shared.czero otherwise. */
static void sismemberCommand(redisClient *c) {
    robj *set = lookupKeyReadOrReply(c,c->argv[1],shared.czero);

    if (set == NULL || checkType(c,set,REDIS_SET)) return;

    addReply(c, dictFind(set->ptr,c->argv[2]) ? shared.cone : shared.czero);
}

/* Reply with the number of members of the set stored at argv[1]. */
static void scardCommand(redisClient *c) {
    robj *set = lookupKeyReadOrReply(c,c->argv[1],shared.czero);

    if (set == NULL) return;
    if (checkType(c,set,REDIS_SET)) return;

    addReplyUlong(c,dictSize((dict*)set->ptr));
}

4812 4813 4814 4815
/* Remove the entry picked by dictGetRandomKey() from the set stored at
 * argv[1] and send it as a bulk reply. shared.nullbulk is sent when no
 * entry is returned. The key is deleted when the set becomes empty. */
static void spopCommand(redisClient *c) {
    robj *set, *ele;
    dictEntry *entry;

    set = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk);
    if (set == NULL || checkType(c,set,REDIS_SET)) return;

    entry = dictGetRandomKey(set->ptr);
    if (entry == NULL) {
        addReply(c,shared.nullbulk);
        return;
    }

    /* Queue the reply before removing the member from the set. */
    ele = dictGetEntryKey(entry);
    addReplyBulk(c,ele);
    dictDelete(set->ptr,ele);
    if (htNeedsResize(set->ptr)) dictResize(set->ptr);
    if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
    server.dirty++;
}

A
antirez 已提交
4833 4834 4835 4836
/* Send as a bulk reply the entry picked by dictGetRandomKey() from the set
 * stored at argv[1], without removing it. shared.nullbulk is sent when no
 * entry is returned. */
static void srandmemberCommand(redisClient *c) {
    robj *set, *ele;
    dictEntry *entry;

    set = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk);
    if (set == NULL || checkType(c,set,REDIS_SET)) return;

    entry = dictGetRandomKey(set->ptr);
    if (entry == NULL) {
        addReply(c,shared.nullbulk);
        return;
    }
    ele = dictGetEntryKey(entry);
    addReplyBulk(c,ele);
}

A
antirez 已提交
4850 4851 4852
/* qsort() comparator ordering an array of dict pointers by ascending
 * dictSize().
 *
 * s1, s2 - pointers to the array slots (dict **) being compared.
 *
 * Returns -1, 0 or 1. The sizes are compared explicitly instead of being
 * subtracted: the difference of two sizes narrowed to int can wrap, giving
 * the wrong sign or a false "equal" for large sets.
 * NOTE(review): assumes dictSize() yields a non-negative integer count that
 * fits in unsigned long -- confirm against dict.h. */
static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
    dict * const *d1 = s1, * const *d2 = s2;
    unsigned long size1 = dictSize(*d1);
    unsigned long size2 = dictSize(*d2);

    return (size1 > size2) - (size1 < size2);
}

4856
/* Shared implementation of the set intersection commands.
 *
 * setskeys - array of 'setsnum' key objects naming the sets to intersect.
 * dstkey   - NULL to send the intersection as a multi bulk reply, or the
 *            key where the resulting set must be stored.
 *
 * A missing key makes the intersection empty: without dstkey an empty
 * multi bulk reply is sent, with dstkey the destination key is deleted and
 * shared.czero is sent. A key holding a non-set value yields a wrong-type
 * error. */
static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
    dict **sets = zmalloc(sizeof(dict*)*setsnum);
    dictIterator *iter;
    dictEntry *entry;
    robj *lenobj = NULL, *dstset = NULL;
    unsigned long j, cardinality = 0;

    /* Collect the dictionaries of all the involved sets. */
    for (j = 0; j < setsnum; j++) {
        robj *setobj;

        if (dstkey)
            setobj = lookupKeyWrite(c->db,setskeys[j]);
        else
            setobj = lookupKeyRead(c->db,setskeys[j]);

        if (setobj == NULL) {
            zfree(sets);
            if (!dstkey) {
                addReply(c,shared.emptymultibulk);
                return;
            }
            if (deleteKey(c->db,dstkey))
                server.dirty++;
            addReply(c,shared.czero);
            return;
        }
        if (setobj->type != REDIS_SET) {
            zfree(sets);
            addReply(c,shared.wrongtypeerr);
            return;
        }
        sets[j] = setobj->ptr;
    }

    /* Sort sets from the smallest to largest, this will improve our
     * algorithm's performance */
    qsort(sets,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);

    if (dstkey) {
        /* If we have a target key where to store the resulting set
         * create an empty set to fill. */
        dstset = createSetObject();
    } else {
        /* The first thing we should output is the total number of
         * elements, but at this stage we don't know the intersection
         * size. So we append an empty object to the output list and keep
         * the pointer, in order to set the right length later. */
        lenobj = createObject(REDIS_STRING,NULL);
        addReply(c,lenobj);
        decrRefCount(lenobj);
    }

    /* Iterate all the elements of the first (smallest) set, and test
     * the element against all the other sets: if at least one set does
     * not include the element it is discarded. */
    iter = dictGetIterator(sets[0]);
    while ((entry = dictNext(iter)) != NULL) {
        robj *ele = dictGetEntryKey(entry);
        int missing = 0;

        for (j = 1; j < setsnum; j++) {
            if (dictFind(sets[j],ele) == NULL) {
                missing = 1;
                break;
            }
        }
        if (missing) continue; /* at least one set lacks the member */

        if (dstkey) {
            dictAdd(dstset->ptr,ele,NULL);
            incrRefCount(ele);
        } else {
            addReplyBulk(c,ele);
            cardinality++;
        }
    }
    dictReleaseIterator(iter);

    if (!dstkey) {
        lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
    } else {
        /* Store the resulting set into the target, if the intersection
         * is not an empty set. */
        deleteKey(c->db,dstkey);
        if (dictSize((dict*)dstset->ptr) > 0) {
            dictAdd(c->db->dict,dstkey,dstset);
            incrRefCount(dstkey);
            addReplyLong(c,dictSize((dict*)dstset->ptr));
        } else {
            decrRefCount(dstset);
            addReply(c,shared.czero);
        }
        server.dirty++;
    }
    zfree(sets);
}

/* Intersect the sets named by argv[1..argc-1] and send the result. */
static void sinterCommand(redisClient *c) {
    robj **keys = c->argv+1;

    sinterGenericCommand(c,keys,c->argc-1,NULL);
}

/* Intersect the sets named by argv[2..argc-1] and store the result at the
 * key argv[1]. */
static void sinterstoreCommand(redisClient *c) {
    robj *dstkey = c->argv[1];
    robj **keys = c->argv+2;

    sinterGenericCommand(c,keys,c->argc-2,dstkey);
}

4956 4957
#define REDIS_OP_UNION 0
#define REDIS_OP_DIFF 1
4958
#define REDIS_OP_INTER 2
4959 4960

/* Shared implementation of the set union and difference commands.
 *
 * setskeys - array of 'setsnum' key objects naming the input sets.
 * dstkey   - NULL to send the result as a multi bulk reply, or the key
 *            where the resulting set must be stored.
 * op       - REDIS_OP_UNION or REDIS_OP_DIFF.
 *
 * Missing keys are handled like empty sets. A key holding a non-set value
 * yields a wrong-type error. For REDIS_OP_DIFF the result starts as a copy
 * of the first set, then the members of every following set are removed
 * from it. */
static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
    dict **sets = zmalloc(sizeof(dict*)*setsnum);
    dictIterator *iter;
    dictEntry *entry;
    robj *dstset = NULL;
    int j, cardinality = 0;

    /* Collect the dictionaries of all the involved sets; NULL marks a
     * non existing key. */
    for (j = 0; j < setsnum; j++) {
        robj *setobj;

        if (dstkey)
            setobj = lookupKeyWrite(c->db,setskeys[j]);
        else
            setobj = lookupKeyRead(c->db,setskeys[j]);

        if (setobj == NULL) {
            sets[j] = NULL;
            continue;
        }
        if (setobj->type != REDIS_SET) {
            zfree(sets);
            addReply(c,shared.wrongtypeerr);
            return;
        }
        sets[j] = setobj->ptr;
    }

    /* We need a temp set object to store our result. If dstkey is not
     * NULL (that is, we are inside a STORE operation) then this set
     * object will be the resulting object to set into the target key. */
    dstset = createSetObject();

    for (j = 0; j < setsnum; j++) {
        /* In union mode, and for the first set in diff mode, members are
         * added to the result; otherwise they are removed from it. */
        int adding = (op == REDIS_OP_UNION || j == 0);

        if (sets[j] == NULL) {
            /* A missing first set makes the difference empty. */
            if (op == REDIS_OP_DIFF && j == 0) break;
            continue; /* non existing keys are like empty sets */
        }

        iter = dictGetIterator(sets[j]);
        while ((entry = dictNext(iter)) != NULL) {
            robj *ele = dictGetEntryKey(entry);

            if (adding) {
                /* dictAdd will not add the same element multiple times */
                if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
                    incrRefCount(ele);
                    cardinality++;
                }
            } else if (op == REDIS_OP_DIFF) {
                if (dictDelete(dstset->ptr,ele) == DICT_OK)
                    cardinality--;
            }
        }
        dictReleaseIterator(iter);

        /* result set is empty? Exit asap. */
        if (op == REDIS_OP_DIFF && cardinality == 0) break;
    }

    if (dstkey == NULL) {
        /* Output the content of the resulting set, then release it. */
        addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
        iter = dictGetIterator(dstset->ptr);
        while ((entry = dictNext(iter)) != NULL) {
            robj *ele = dictGetEntryKey(entry);

            addReplyBulk(c,ele);
        }
        dictReleaseIterator(iter);
        decrRefCount(dstset);
    } else {
        /* Replace the target key with the resulting set, or just delete
         * the target key when the result is empty. */
        deleteKey(c->db,dstkey);
        if (dictSize((dict*)dstset->ptr) > 0) {
            dictAdd(c->db->dict,dstkey,dstset);
            incrRefCount(dstkey);
            addReplyLong(c,dictSize((dict*)dstset->ptr));
        } else {
            decrRefCount(dstset);
            addReply(c,shared.czero);
        }
        server.dirty++;
    }
    zfree(sets);
}

/* Send the union of the sets named by argv[1..argc-1]. */
static void sunionCommand(redisClient *c) {
    robj **keys = c->argv+1;

    sunionDiffGenericCommand(c,keys,c->argc-1,NULL,REDIS_OP_UNION);
}

/* Store at the key argv[1] the union of the sets named by
 * argv[2..argc-1]. */
static void sunionstoreCommand(redisClient *c) {
    robj *dstkey = c->argv[1];
    robj **keys = c->argv+2;

    sunionDiffGenericCommand(c,keys,c->argc-2,dstkey,REDIS_OP_UNION);
}

/* Send the difference between the set named by argv[1] and the sets named
 * by argv[2..argc-1]. */
static void sdiffCommand(redisClient *c) {
    robj **keys = c->argv+1;

    sunionDiffGenericCommand(c,keys,c->argc-1,NULL,REDIS_OP_DIFF);
}

/* Store at the key argv[1] the difference between the set named by argv[2]
 * and the sets named by argv[3..argc-1]. */
static void sdiffstoreCommand(redisClient *c) {
    robj *dstkey = c->argv[1];
    robj **keys = c->argv+2;

    sunionDiffGenericCommand(c,keys,c->argc-2,dstkey,REDIS_OP_DIFF);
}

5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087
/* ==================================== ZSets =============================== */

/* ZSETs are ordered sets using two data structures to hold the same elements
 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
 * data structure.
 *
 * The elements are added to a hash table mapping Redis objects to scores.
 * At the same time the elements are added to a skip list mapping scores
 * to Redis objects (so objects are sorted by scores in this "view"). */

/* This skiplist implementation is almost a C translation of the original
 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
 * Alternative to Balanced Trees", modified in three ways:
 * a) this implementation allows for repeated values.
 * b) the comparison is not just by key (our 'score') but by satellite data.
 * c) there is a back pointer, so it's a doubly linked list with the back
 * pointers being only at "level 1". This allows to traverse the list
 * from tail to head, useful for ZREVRANGE. */

/* Allocate a skiplist node with 'level' forward pointers.
 *
 * level - number of levels of the node.
 * score - score stored in the node.
 * obj   - object stored in the node; the pointer is stored as it is, no
 *         reference count is touched here.
 *
 * The span array holds level-1 entries (code in this file indexes it as
 * span[i-1] for levels i > 0), so a node with fewer than two levels needs
 * none: span is set to NULL instead of requesting a zero-byte allocation
 * or being left uninitialized. The forward pointers and the backward
 * pointer are not initialized here.
 * NOTE(review): zslFreeNode()/zslFree() pass span to zfree(); this assumes
 * zfree(NULL) is a no-op -- confirm in zmalloc.c. */
static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
    zskiplistNode *zn = zmalloc(sizeof(*zn));

    zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
    if (level > 1)
        zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
    else
        zn->span = NULL;
    zn->score = score;
    zn->obj = obj;
    return zn;
}

static zskiplist *zslCreate(void) {
    int j;
    zskiplist *zsl;
A
Alex McHale 已提交
5098

5099 5100
    zsl = zmalloc(sizeof(*zsl));
    zsl->level = 1;
5101
    zsl->length = 0;
5102
    zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
5103
    for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
5104
        zsl->header->forward[j] = NULL;
5105 5106 5107 5108

        /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
        if (j < ZSKIPLIST_MAXLEVEL-1)
            zsl->header->span[j] = 0;
5109
    }
5110 5111
    zsl->header->backward = NULL;
    zsl->tail = NULL;
5112 5113 5114
    return zsl;
}

5115 5116
/* Release a skiplist node: drop the reference held on its object, then
 * free the span array, the forward array and the node itself. */
static void zslFreeNode(zskiplistNode *node) {
    decrRefCount(node->obj);
    zfree(node->span);
    zfree(node->forward);
    zfree(node);
}

/* Release a whole skiplist: the header, every node reachable through the
 * level 0 forward pointers, and the zskiplist structure itself. */
static void zslFree(zskiplist *zsl) {
    zskiplistNode *header = zsl->header;
    zskiplistNode *cur = header->forward[0];

    /* The header is freed by hand rather than via zslFreeNode(), which
     * would also call decrRefCount() on the node object. */
    zfree(header->forward);
    zfree(header->span);
    zfree(header);

    while (cur != NULL) {
        zskiplistNode *following = cur->forward[0];

        zslFreeNode(cur);
        cur = following;
    }
    zfree(zsl);
}

5136 5137 5138 5139
static int zslRandomLevel(void) {
    int level = 1;
    while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
        level += 1;
A
antirez 已提交
5140
    return (level<ZSKIPLIST_MAXLEVEL) ? level : ZSKIPLIST_MAXLEVEL;
5141 5142 5143 5144
}

/* Insert a new node holding 'score' and 'obj' into the skiplist, keeping
 * nodes ordered by score and, for equal scores, by compareStringObjects().
 *
 * We assume the element is not already inside: duplicated scores are
 * allowed, but the caller is expected to check (in the hash table) that
 * the same object is not inserted twice. The object pointer is stored as
 * it is, no reference count is touched here. */
static void zslInsert(zskiplist *zsl, double score, robj *obj) {
    zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *cur, *node;
    unsigned int rank[ZSKIPLIST_MAXLEVEL];
    int i, level;

    /* For every level find the last node preceding the insert position
     * (update[i]) and the rank crossed to reach it (rank[i]). */
    cur = zsl->header;
    for (i = zsl->level-1; i >= 0; i--) {
        zskiplistNode *ahead;

        if (i == zsl->level-1)
            rank[i] = 0;
        else
            rank[i] = rank[i+1];

        while ((ahead = cur->forward[i]) != NULL) {
            int precedes = ahead->score < score ||
                (ahead->score == score &&
                 compareStringObjects(ahead->obj,obj) < 0);

            if (!precedes) break;
            rank[i] += i > 0 ? cur->span[i-1] : 1;
            cur = ahead;
        }
        update[i] = cur;
    }

    level = zslRandomLevel();
    if (level > zsl->level) {
        /* Levels not in use so far start from the header, whose span
         * covers the whole list. */
        for (i = zsl->level; i < level; i++) {
            rank[i] = 0;
            update[i] = zsl->header;
            update[i]->span[i-1] = zsl->length;
        }
        zsl->level = level;
    }

    node = zslCreateNode(level,score,obj);
    for (i = 0; i < level; i++) {
        zskiplistNode *prev = update[i];

        node->forward[i] = prev->forward[i];
        prev->forward[i] = node;

        /* update span covered by update[i] as the node is inserted here */
        if (i > 0) {
            node->span[i-1] = prev->span[i-1] - (rank[0] - rank[i]);
            prev->span[i-1] = (rank[0] - rank[i]) + 1;
        }
    }

    /* increment span for untouched levels */
    for (i = level; i < zsl->level; i++)
        update[i]->span[i-1]++;

    /* Fix the level 0 backward links and the tail pointer. */
    if (update[0] == zsl->header)
        node->backward = NULL;
    else
        node->backward = update[0];
    if (node->forward[0] != NULL)
        node->forward[0]->backward = node;
    else
        zsl->tail = node;
    zsl->length++;
}

5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224
/* Internal function used by zslDelete, zslDeleteRangeByScore and
 * zslDeleteRangeByRank.
 *
 * Unlink node 'x' from the skiplist. 'update' must hold, for every level
 * in use, the last node preceding 'x' at that level. The node itself is
 * not freed. */
void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) {
    int i;

    for (i = 0; i < zsl->level; i++) {
        zskiplistNode *prev = update[i];

        if (prev->forward[i] != x) {
            /* invariant: i > 0, because update[0]->forward[0]
             * is always equal to x */
            prev->span[i-1] -= 1;
            continue;
        }
        if (i > 0)
            prev->span[i-1] += x->span[i-1] - 1;
        prev->forward[i] = x->forward[i];
    }

    if (x->forward[0] != NULL)
        x->forward[0]->backward = x->backward;
    else
        zsl->tail = x->backward;

    /* Drop the top levels left without nodes. */
    while (zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
        zsl->level--;
    zsl->length--;
}

5225
/* Delete an element with matching score/object from the skiplist. */
5226
static int zslDelete(zskiplist *zsl, double score, robj *obj) {
5227 5228 5229 5230 5231
    zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
    int i;

    x = zsl->header;
    for (i = zsl->level-1; i >= 0; i--) {
5232 5233 5234 5235
        while (x->forward[i] &&
            (x->forward[i]->score < score ||
                (x->forward[i]->score == score &&
                compareStringObjects(x->forward[i]->obj,obj) < 0)))
5236 5237 5238 5239 5240 5241
            x = x->forward[i];
        update[i] = x;
    }
    /* We may have multiple elements with the same score, what we need
     * is to find the element with both the right score and object. */
    x = x->forward[0];
5242
    if (x && score == x->score && compareStringObjects(x->obj,obj) == 0) {
5243
        zslDeleteNode(zsl, x, update);
5244 5245 5246 5247
        zslFreeNode(x);
        return 1;
    } else {
        return 0; /* not found */
5248 5249
    }
    return 0; /* not found */
5250 5251
}

5252 5253 5254 5255
/* Delete all the elements with score between min and max from the skiplist.
 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
 * Note that this function takes the reference to the hash table view of the
 * sorted set, in order to remove the elements from the hash table too. */
5256
static unsigned long zslDeleteRangeByScore(zskiplist *zsl, double min, double max, dict *dict) {
5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270
    zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
    unsigned long removed = 0;
    int i;

    x = zsl->header;
    for (i = zsl->level-1; i >= 0; i--) {
        while (x->forward[i] && x->forward[i]->score < min)
            x = x->forward[i];
        update[i] = x;
    }
    /* We may have multiple elements with the same score, what we need
     * is to find the element with both the right score and object. */
    x = x->forward[0];
    while (x && x->score <= max) {
5271 5272
        zskiplistNode *next = x->forward[0];
        zslDeleteNode(zsl, x, update);
5273 5274 5275 5276 5277 5278 5279 5280
        dictDelete(dict,x->obj);
        zslFreeNode(x);
        removed++;
        x = next;
    }
    return removed; /* not found */
}

P
Pieter Noordhuis 已提交
5281
/* Delete all the elements with rank between start and end from the skiplist.
5282
 * Start and end are inclusive. Note that start and end need to be 1-based */
P
Pieter Noordhuis 已提交
5283 5284 5285 5286 5287 5288 5289 5290 5291 5292
static unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) {
    zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
    unsigned long traversed = 0, removed = 0;
    int i;

    x = zsl->header;
    for (i = zsl->level-1; i >= 0; i--) {
        while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) < start) {
            traversed += i > 0 ? x->span[i-1] : 1;
            x = x->forward[i];
5293
        }
P
Pieter Noordhuis 已提交
5294 5295 5296 5297 5298 5299
        update[i] = x;
    }

    traversed++;
    x = x->forward[0];
    while (x && traversed <= end) {
5300 5301
        zskiplistNode *next = x->forward[0];
        zslDeleteNode(zsl, x, update);
5302 5303 5304
        dictDelete(dict,x->obj);
        zslFreeNode(x);
        removed++;
P
Pieter Noordhuis 已提交
5305
        traversed++;
5306 5307
        x = next;
    }
P
Pieter Noordhuis 已提交
5308
    return removed;
5309 5310
}

5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326
/* Return the first node whose score is greater than or equal to 'score',
 * or NULL when the skiplist holds no such node. */
static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
    zskiplistNode *node = zsl->header;
    int level = zsl->level;

    while (level-- > 0) {
        zskiplistNode *next;

        while ((next = node->forward[level]) != NULL && next->score < score)
            node = next;
    }
    /* 'node' is the last one with a smaller score (or the header), so its
     * level-0 successor is the answer. */
    return node->forward[0];
}

5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341
/* Compute the rank of the element identified by both its score and its
 * object. The rank is 1-based, because the link from zsl->header to the
 * first element is counted as well. Returns 0 when the element is not
 * found. */
static unsigned long zslGetRank(zskiplist *zsl, double score, robj *o) {
    zskiplistNode *node = zsl->header;
    unsigned long rank = 0;
    int level;

    for (level = zsl->level-1; level >= 0; level--) {
        for (;;) {
            zskiplistNode *next = node->forward[level];

            if (next == NULL) break;
            /* Keep advancing while the next node sorts before, or is equal
             * to, the (score, object) pair we are looking for. */
            if (!(next->score < score) &&
                !(next->score == score &&
                  compareStringObjects(next->obj,o) <= 0)) break;

            rank += level > 0 ? node->span[level-1] : 1;
            node = next;
        }

        /* 'node' may still be zsl->header, so check that obj is non-NULL
         * before comparing. */
        if (node->obj && compareStringObjects(node->obj,o) == 0)
            return rank;
    }
    return 0;
}

5354 5355 5356 5357 5358 5359 5360 5361
/* Look up the node sitting at the given rank; the rank argument is 1-based.
 * Returns NULL when the walk never lands exactly on 'rank'. */
zskiplistNode* zslGetElementByRank(zskiplist *zsl, unsigned long rank) {
    zskiplistNode *node = zsl->header;
    unsigned long seen = 0;
    int level = zsl->level;

    while (level-- > 0) {
        while (node->forward[level] != NULL) {
            /* Width of the link we are about to follow. */
            unsigned long step = (level > 0) ? node->span[level-1] : 1;

            if (seen + step > rank) break;
            seen += step;
            node = node->forward[level];
        }
        if (seen == rank) return node;
    }
    return NULL;
}

5374 5375
/* The actual Z-commands implementations */

A
antirez 已提交
5376
/* This generic command implements both ZADD and ZINCRBY.
A
antirez 已提交
5377
 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
A
antirez 已提交
5378
 * the increment if the operation is a ZINCRBY (doincrement == 1). */
A
antirez 已提交
5379
static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
5380 5381 5382 5383
    robj *zsetobj;
    zset *zs;
    double *score;

A
antirez 已提交
5384
    zsetobj = lookupKeyWrite(c->db,key);
5385 5386
    if (zsetobj == NULL) {
        zsetobj = createZsetObject();
A
antirez 已提交
5387 5388
        dictAdd(c->db->dict,key,zsetobj);
        incrRefCount(key);
5389 5390 5391 5392 5393 5394 5395
    } else {
        if (zsetobj->type != REDIS_ZSET) {
            addReply(c,shared.wrongtypeerr);
            return;
        }
    }
    zs = zsetobj->ptr;
A
antirez 已提交
5396

A
antirez 已提交
5397
    /* Ok now since we implement both ZADD and ZINCRBY here the code
A
antirez 已提交
5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416
     * needs to handle the two different conditions. It's all about setting
     * '*score', that is, the new score to set, to the right value. */
    score = zmalloc(sizeof(double));
    if (doincrement) {
        dictEntry *de;

        /* Read the old score. If the element was not present starts from 0 */
        de = dictFind(zs->dict,ele);
        if (de) {
            double *oldscore = dictGetEntryVal(de);
            *score = *oldscore + scoreval;
        } else {
            *score = scoreval;
        }
    } else {
        *score = scoreval;
    }

    /* What follows is a simple remove and re-insert operation that is common
A
antirez 已提交
5417
     * to both ZADD and ZINCRBY... */
A
antirez 已提交
5418
    if (dictAdd(zs->dict,ele,score) == DICT_OK) {
5419
        /* case 1: New element */
A
antirez 已提交
5420 5421 5422
        incrRefCount(ele); /* added to hash */
        zslInsert(zs->zsl,*score,ele);
        incrRefCount(ele); /* added to skiplist */
5423
        server.dirty++;
A
antirez 已提交
5424 5425
        if (doincrement)
            addReplyDouble(c,*score);
A
antirez 已提交
5426 5427
        else
            addReply(c,shared.cone);
5428 5429 5430
    } else {
        dictEntry *de;
        double *oldscore;
A
Alex McHale 已提交
5431

5432
        /* case 2: Score update operation */
A
antirez 已提交
5433
        de = dictFind(zs->dict,ele);
5434
        redisAssert(de != NULL);
5435 5436 5437 5438
        oldscore = dictGetEntryVal(de);
        if (*score != *oldscore) {
            int deleted;

A
antirez 已提交
5439 5440
            /* Remove and insert the element in the skip list with new score */
            deleted = zslDelete(zs->zsl,*oldscore,ele);
5441
            redisAssert(deleted != 0);
A
antirez 已提交
5442 5443 5444 5445
            zslInsert(zs->zsl,*score,ele);
            incrRefCount(ele);
            /* Update the score in the hash table */
            dictReplace(zs->dict,ele,score);
5446
            server.dirty++;
5447 5448
        } else {
            zfree(score);
5449
        }
A
antirez 已提交
5450 5451 5452 5453
        if (doincrement)
            addReplyDouble(c,*score);
        else
            addReply(c,shared.czero);
5454 5455 5456
    }
}

A
antirez 已提交
5457 5458 5459
/* Command handler: parses argv[2] as the score and hands argv[1] (key) and
 * argv[3] (element) to zaddGenericCommand() with doincrement == 0.
 * When the score cannot be parsed nothing else is done here. */
static void zaddCommand(redisClient *c) {
    double score;

    if (getDoubleFromObject(c, c->argv[2], &score) == REDIS_OK)
        zaddGenericCommand(c,c->argv[1],c->argv[3],score,0);
}

A
antirez 已提交
5465
/* Command handler: parses argv[2] as the increment and hands argv[1] (key)
 * and argv[3] (element) to zaddGenericCommand() with doincrement == 1.
 * When the increment cannot be parsed nothing else is done here. */
static void zincrbyCommand(redisClient *c) {
    double increment;

    if (getDoubleFromObject(c, c->argv[2], &increment) == REDIS_OK)
        zaddGenericCommand(c,c->argv[1],c->argv[3],increment,1);
}

A
antirez 已提交
5473 5474 5475
/* Remove the element argv[2] from the sorted set stored at argv[1].
 * Replies shared.czero when the key or the element does not exist and
 * shared.cone after a removal. The key itself is deleted once the set
 * becomes empty. */
static void zremCommand(redisClient *c) {
    robj *key = c->argv[1];
    robj *member = c->argv[2];
    robj *zobj;
    zset *zs;
    dictEntry *entry;
    double *curscore;
    int deleted;

    zobj = lookupKeyWriteOrReply(c,key,shared.czero);
    if (zobj == NULL) return;
    if (checkType(c,zobj,REDIS_ZSET)) return;

    zs = zobj->ptr;
    entry = dictFind(zs->dict,member);
    if (entry == NULL) {
        addReply(c,shared.czero);
        return;
    }

    /* The skiplist is searched by score, so the score is read from the
     * hash table before the entry is dropped. */
    curscore = dictGetEntryVal(entry);
    deleted = zslDelete(zs->zsl,*curscore,member);
    redisAssert(deleted != 0);

    /* Now drop the element from the hash table as well. */
    dictDelete(zs->dict,member);
    if (htNeedsResize(zs->dict)) dictResize(zs->dict);
    if (dictSize(zs->dict) == 0) deleteKey(c->db,key);
    server.dirty++;
    addReply(c,shared.cone);
}

5502
/* Remove from the sorted set at argv[1] every element with a score between
 * argv[2] (min) and argv[3] (max), as done by zslDeleteRangeByScore().
 * Replies with the number of removed elements, or shared.czero when the
 * key does not exist. The key is deleted once the set becomes empty. */
static void zremrangebyscoreCommand(redisClient *c) {
    robj *key = c->argv[1];
    robj *zobj;
    zset *zs;
    double min, max;
    long deleted;

    if (getDoubleFromObject(c, c->argv[2], &min) != REDIS_OK) return;
    if (getDoubleFromObject(c, c->argv[3], &max) != REDIS_OK) return;

    zobj = lookupKeyWriteOrReply(c,key,shared.czero);
    if (zobj == NULL || checkType(c,zobj,REDIS_ZSET)) return;

    zs = zobj->ptr;
    deleted = zslDeleteRangeByScore(zs->zsl,min,max,zs->dict);
    if (htNeedsResize(zs->dict)) dictResize(zs->dict);
    if (dictSize(zs->dict) == 0) deleteKey(c->db,key);
    server.dirty += deleted;
    addReplyLong(c,deleted);
}

P
Pieter Noordhuis 已提交
5523
/* Remove from the sorted set at argv[1] the elements whose 0-based index
 * lies between argv[2] (start) and argv[3] (end), both inclusive.
 * Negative indexes count from the end of the set. Replies with the number
 * of removed elements, shared.czero when the key does not exist or the
 * range is empty. The key is deleted once the set becomes empty. */
static void zremrangebyrankCommand(redisClient *c) {
    robj *key = c->argv[1];
    robj *zobj;
    zset *zs;
    long start, end;
    long deleted;
    int llen;

    if (getLongFromObject(c, c->argv[2], &start) != REDIS_OK) return;
    if (getLongFromObject(c, c->argv[3], &end) != REDIS_OK) return;

    zobj = lookupKeyWriteOrReply(c,key,shared.czero);
    if (zobj == NULL || checkType(c,zobj,REDIS_ZSET)) return;
    zs = zobj->ptr;
    llen = zs->zsl->length;

    /* Negative indexes are offsets from the end; whatever is still
     * negative after the conversion is clamped to 0. */
    if (start < 0) {
        start = llen+start;
        if (start < 0) start = 0;
    }
    if (end < 0) {
        end = llen+end;
        if (end < 0) end = 0;
    }

    /* An inverted or out of range interval selects nothing. */
    if (start > end || start >= llen) {
        addReply(c,shared.czero);
        return;
    }
    if (end >= llen) end = llen-1;

    /* zsl*Rank functions work with 1-based ranks, hence the +1. */
    deleted = zslDeleteRangeByRank(zs->zsl,start+1,end+1,zs->dict);
    if (htNeedsResize(zs->dict)) dictResize(zs->dict);
    if (dictSize(zs->dict) == 0) deleteKey(c->db,key);
    server.dirty += deleted;
    addReplyLong(c, deleted);
}

5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573
/* One input of a ZUNION/ZINTER operation. */
typedef struct {
    dict *dict;     /* hash table of the source sorted set, NULL when the key does not exist */
    double weight;  /* factor every score read from 'dict' is multiplied by */
} zsetopsrc;

/* qsort() comparison callback ordering zsetopsrc entries by the number of
 * elements in their dict, smallest first. An entry with a NULL dict counts
 * as having 0 elements.
 * Returns a negative, zero or positive value as required by qsort(). */
static int qsortCompareZsetopsrcByCardinality(const void *s1, const void *s2) {
    const zsetopsrc *d1 = s1, *d2 = s2;
    unsigned long size1, size2;

    size1 = d1->dict ? dictSize(d1->dict) : 0;
    size2 = d2->dict ? dictSize(d2->dict) : 0;
    /* Compare explicitly: returning size1 - size2 would narrow an unsigned
     * long difference to int and can yield the wrong sign. */
    return (size1 > size2) - (size1 < size2);
}

5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590
#define REDIS_AGGR_SUM 1
#define REDIS_AGGR_MIN 2
#define REDIS_AGGR_MAX 3

/* Fold 'val' into '*target' according to the aggregation mode:
 * REDIS_AGGR_SUM adds it, REDIS_AGGR_MIN / REDIS_AGGR_MAX keep the smaller /
 * larger of the two values. Any other mode trips an assertion. */
static inline void zunionInterAggregate(double *target, double val, int aggregate) {
    switch (aggregate) {
    case REDIS_AGGR_SUM:
        *target += val;
        break;
    case REDIS_AGGR_MIN:
        if (val < *target) *target = val;
        break;
    case REDIS_AGGR_MAX:
        if (val > *target) *target = val;
        break;
    default:
        /* safety net */
        redisAssert(0 != 0);
    }
}

5591
/* Generic implementation of the union (op == REDIS_OP_UNION) and
 * intersection (op == REDIS_OP_INTER) of sorted sets.
 *
 * argv[2] holds the number of input keys, the keys themselves follow from
 * argv[3] on. After the keys two optional clauses are accepted:
 * "weights" followed by one weight per input key, and "aggregate" followed
 * by "sum", "min" or "max" (default: sum).
 *
 * The previous value of dstkey is deleted and, when the result is not
 * empty, replaced by the new sorted set. The reply is the number of
 * elements in the result, or an error when the arguments are malformed or
 * an input key holds another type. */
static void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) {
    int i, j, zsetnum;
    int aggregate = REDIS_AGGR_SUM;
    zsetopsrc *src;
    robj *dstobj;
    zset *dstzset;
    dictIterator *di;
    dictEntry *de;

    /* expect zsetnum input keys to be given */
    zsetnum = atoi(c->argv[2]->ptr);
    if (zsetnum < 1) {
        addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNION/ZINTER\r\n"));
        return;
    }

    /* test if more keys are announced than arguments were given; written as
     * a subtraction so that a huge zsetnum cannot overflow the addition */
    if (zsetnum > c->argc-3) {
        addReply(c,shared.syntaxerr);
        return;
    }

    /* read keys to be used for input */
    src = zmalloc(sizeof(zsetopsrc) * zsetnum);
    for (i = 0, j = 3; i < zsetnum; i++, j++) {
        robj *zsetobj = lookupKeyWrite(c->db,c->argv[j]);
        if (!zsetobj) {
            src[i].dict = NULL;
        } else {
            if (zsetobj->type != REDIS_ZSET) {
                zfree(src);
                addReply(c,shared.wrongtypeerr);
                return;
            }
            src[i].dict = ((zset*)zsetobj->ptr)->dict;
        }

        /* default all weights to 1 */
        src[i].weight = 1.0;
    }

    /* parse optional extra arguments */
    if (j < c->argc) {
        int remaining = c->argc - j;

        while (remaining) {
            if (remaining >= (zsetnum + 1) && !strcasecmp(c->argv[j]->ptr,"weights")) {
                j++; remaining--;
                for (i = 0; i < zsetnum; i++, j++, remaining--) {
                    if (getDoubleFromObject(c, c->argv[j], &src[i].weight) != REDIS_OK) {
                        /* release the source array on this error path too */
                        zfree(src);
                        return;
                    }
                }
            } else if (remaining >= 2 && !strcasecmp(c->argv[j]->ptr,"aggregate")) {
                j++; remaining--;
                if (!strcasecmp(c->argv[j]->ptr,"sum")) {
                    aggregate = REDIS_AGGR_SUM;
                } else if (!strcasecmp(c->argv[j]->ptr,"min")) {
                    aggregate = REDIS_AGGR_MIN;
                } else if (!strcasecmp(c->argv[j]->ptr,"max")) {
                    aggregate = REDIS_AGGR_MAX;
                } else {
                    zfree(src);
                    addReply(c,shared.syntaxerr);
                    return;
                }
                j++; remaining--;
            } else {
                zfree(src);
                addReply(c,shared.syntaxerr);
                return;
            }
        }
    }

    /* sort sets from the smallest to largest, this will improve our
     * algorithm's performance */
    qsort(src,zsetnum,sizeof(zsetopsrc), qsortCompareZsetopsrcByCardinality);

    dstobj = createZsetObject();
    dstzset = dstobj->ptr;

    if (op == REDIS_OP_INTER) {
        /* skip going over all entries if the smallest zset is NULL or empty */
        if (src[0].dict && dictSize(src[0].dict) > 0) {
            /* precondition: as src[0].dict is non-empty and the zsets are ordered
             * from small to large, all src[i > 0].dict are non-empty too */
            di = dictGetIterator(src[0].dict);
            while((de = dictNext(di)) != NULL) {
                double *score = zmalloc(sizeof(double)), value;
                *score = src[0].weight * (*(double*)dictGetEntryVal(de));

                for (j = 1; j < zsetnum; j++) {
                    dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
                    if (other) {
                        value = src[j].weight * (*(double*)dictGetEntryVal(other));
                        zunionInterAggregate(score, value, aggregate);
                    } else {
                        break;
                    }
                }

                /* skip entry when not present in every source dict */
                if (j != zsetnum) {
                    zfree(score);
                } else {
                    robj *o = dictGetEntryKey(de);
                    dictAdd(dstzset->dict,o,score);
                    incrRefCount(o); /* added to dictionary */
                    zslInsert(dstzset->zsl,*score,o);
                    incrRefCount(o); /* added to skiplist */
                }
            }
            dictReleaseIterator(di);
        }
    } else if (op == REDIS_OP_UNION) {
        for (i = 0; i < zsetnum; i++) {
            if (!src[i].dict) continue;

            di = dictGetIterator(src[i].dict);
            while((de = dictNext(di)) != NULL) {
                /* skip key when already processed */
                if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue;

                double *score = zmalloc(sizeof(double)), value;
                *score = src[i].weight * (*(double*)dictGetEntryVal(de));

                /* because the zsets are sorted by size, its only possible
                 * for sets at larger indices to hold this entry */
                for (j = (i+1); j < zsetnum; j++) {
                    dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
                    if (other) {
                        value = src[j].weight * (*(double*)dictGetEntryVal(other));
                        zunionInterAggregate(score, value, aggregate);
                    }
                }

                robj *o = dictGetEntryKey(de);
                dictAdd(dstzset->dict,o,score);
                incrRefCount(o); /* added to dictionary */
                zslInsert(dstzset->zsl,*score,o);
                incrRefCount(o); /* added to skiplist */
            }
            dictReleaseIterator(di);
        }
    } else {
        /* unknown operator */
        redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION);
    }

    /* Replace the destination key; an empty result leaves it deleted. */
    deleteKey(c->db,dstkey);
    if (dstzset->zsl->length) {
        dictAdd(c->db->dict,dstkey,dstobj);
        incrRefCount(dstkey);
        addReplyLong(c, dstzset->zsl->length);
        server.dirty++;
    } else {
        decrRefCount(dstobj);
        addReply(c, shared.czero);
    }
    zfree(src);
}

5753 5754
/* Command handler: runs zunionInterGenericCommand() as a union, storing the
 * result at argv[1]. */
static void zunionCommand(redisClient *c) {
    robj *dstkey = c->argv[1];

    zunionInterGenericCommand(c,dstkey,REDIS_OP_UNION);
}

5757 5758
/* Command handler: runs zunionInterGenericCommand() as an intersection,
 * storing the result at argv[1]. */
static void zinterCommand(redisClient *c) {
    robj *dstkey = c->argv[1];

    zunionInterGenericCommand(c,dstkey,REDIS_OP_INTER);
}

5761
/* Reply with the elements of the sorted set at argv[1] whose 0-based index
 * lies between argv[2] (start) and argv[3] (end), both inclusive. Negative
 * indexes count from the end. With a non-zero 'reverse' the set is walked
 * from the tail backwards. An optional fifth argument "withscores" makes
 * every element be followed by its score. */
static void zrangeGenericCommand(redisClient *c, int reverse) {
    robj *zobj, *member;
    zset *zs;
    zskiplist *zsl;
    zskiplistNode *node;
    long start, end;
    int withscores = 0;
    int llen, rangelen, left;

    if (getLongFromObject(c, c->argv[2], &start) != REDIS_OK) return;
    if (getLongFromObject(c, c->argv[3], &end) != REDIS_OK) return;

    /* The only accepted extra argument is a single "withscores". */
    if (c->argc >= 5) {
        if (c->argc != 5 || strcasecmp(c->argv[4]->ptr,"withscores") != 0) {
            addReply(c,shared.syntaxerr);
            return;
        }
        withscores = 1;
    }

    zobj = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk);
    if (zobj == NULL || checkType(c,zobj,REDIS_ZSET)) return;
    zs = zobj->ptr;
    zsl = zs->zsl;
    llen = zsl->length;

    /* Negative indexes are offsets from the end; whatever is still
     * negative after the conversion is clamped to 0. */
    if (start < 0) {
        start = llen+start;
        if (start < 0) start = 0;
    }
    if (end < 0) {
        end = llen+end;
        if (end < 0) end = 0;
    }

    /* Out of range start or start > end result in empty list */
    if (start > end || start >= llen) {
        addReply(c,shared.emptymultibulk);
        return;
    }
    if (end >= llen) end = llen-1;
    rangelen = (end-start)+1;

    /* A start of 0 maps directly to the tail / first element; any other
     * start needs a lookup by (1-based) rank. */
    if (start == 0)
        node = reverse ? zsl->tail : zsl->header->forward[0];
    else
        node = zslGetElementByRank(zsl, reverse ? llen-start : start+1);

    /* Multi-bulk header first, then the elements. */
    addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
        withscores ? (rangelen*2) : rangelen));
    for (left = rangelen; left > 0; left--) {
        member = node->obj;
        addReplyBulk(c,member);
        if (withscores)
            addReplyDouble(c,node->score);
        node = reverse ? node->backward : node->forward[0];
    }
}

5825 5826 5827 5828 5829 5830 5831 5832
/* Command handler: runs zrangeGenericCommand() in forward order
 * (reverse == 0). */
static void zrangeCommand(redisClient *c) {
    zrangeGenericCommand(c,0);
}

/* Command handler: runs zrangeGenericCommand() in reverse order
 * (reverse == 1). */
static void zrevrangeCommand(redisClient *c) {
    zrangeGenericCommand(c,1);
}

5833 5834 5835
/* Shared implementation of ZRANGEBYSCORE and ZCOUNT.
 * With a non-zero justcount only the number of matching elements is
 * replied, not the elements themselves. */
static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
    robj *zobj, *lenobj = NULL;
    zset *zs;
    zskiplistNode *node;
    const char *minarg = c->argv[2]->ptr;
    const char *maxarg = c->argv[3]->ptr;
    double min, max;
    int minex = 0, maxex = 0; /* is the min / max bound exclusive? */
    int offset = 0, limit = -1;
    int withscores = 0, badsyntax = 0, haslimit;
    unsigned long rangelen = 0;

    /* Parse the min-max interval. A value prefixed by the "(" character
     * is an open bound. For instance
     * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
     * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
    if (minarg[0] == '(') {
        minex = 1;
        minarg++;
    }
    if (maxarg[0] == '(') {
        maxex = 1;
        maxarg++;
    }
    min = strtod(minarg,NULL);
    max = strtod(maxarg,NULL);

    /* Parse "WITHSCORES": note that if the command was called with
     * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
     * enter the following paths to parse WITHSCORES and LIMIT. */
    switch (c->argc) {
    case 5:
    case 8:
        withscores = (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0);
        badsyntax = !withscores;
        break;
    }
    haslimit = (c->argc == (7 + withscores));
    if (!haslimit && c->argc != (4 + withscores))
        badsyntax = 1;
    if (badsyntax) {
        addReplySds(c,
            sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
        return;
    }

    /* Parse "LIMIT" */
    if (haslimit) {
        if (strcasecmp(c->argv[4]->ptr,"limit") != 0) {
            addReply(c,shared.syntaxerr);
            return;
        }
        offset = atoi(c->argv[5]->ptr);
        limit = atoi(c->argv[6]->ptr);
        if (offset < 0) offset = 0;
    }

    /* Ok, lookup the key and get the range */
    zobj = lookupKeyRead(c->db,c->argv[1]);
    if (zobj == NULL) {
        addReply(c,justcount ? shared.czero : shared.emptymultibulk);
        return;
    }
    if (zobj->type != REDIS_ZSET) {
        addReply(c,shared.wrongtypeerr);
        return;
    }
    zs = zobj->ptr;

    /* Get the first node with the score >= min, or with
     * score > min if 'minex' is true. */
    node = zslFirstWithScore(zs->zsl,min);
    if (minex) {
        while (node != NULL && node->score == min)
            node = node->forward[0];
    }
    if (node == NULL) {
        /* No element matching the specified interval */
        addReply(c,justcount ? shared.czero : shared.emptymultibulk);
        return;
    }

    /* We don't know in advance how many matching elements there
     * are in the list, so we push this object that will represent
     * the multi-bulk length in the output buffer, and will "fix"
     * it later */
    if (!justcount) {
        lenobj = createObject(REDIS_STRING,NULL);
        addReply(c,lenobj);
        decrRefCount(lenobj);
    }

    for (; node != NULL; node = node->forward[0]) {
        int inrange = maxex ? (node->score < max) : (node->score <= max);

        if (!inrange) break;
        /* Elements skipped because of the LIMIT offset are not counted. */
        if (offset) {
            offset--;
            continue;
        }
        if (limit == 0) break;
        if (!justcount) {
            addReplyBulk(c,node->obj);
            if (withscores)
                addReplyDouble(c,node->score);
        }
        rangelen++;
        /* A negative limit means "no limit" and is never decremented. */
        if (limit > 0) limit--;
    }

    if (justcount) {
        addReplyLong(c,(long)rangelen);
    } else {
        /* Fill in the multi-bulk length queued above. */
        lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
             withscores ? (rangelen*2) : rangelen);
    }
}

5949 5950 5951 5952 5953 5954 5955 5956
/* Command handler: runs genericZrangebyscoreCommand() with justcount == 0,
 * so the matching elements themselves are replied. */
static void zrangebyscoreCommand(redisClient *c) {
    genericZrangebyscoreCommand(c,0);
}

/* Command handler: runs genericZrangebyscoreCommand() with justcount == 1,
 * so only the number of matching elements is replied. */
static void zcountCommand(redisClient *c) {
    genericZrangebyscoreCommand(c,1);
}

5957
/* Reply with the number of elements of the sorted set stored at argv[1],
 * or shared.czero when the key does not exist. */
static void zcardCommand(redisClient *c) {
    robj *o = lookupKeyReadOrReply(c,c->argv[1],shared.czero);

    if (o == NULL) return;
    if (checkType(c,o,REDIS_ZSET)) return;

    addReplyUlong(c,((zset*)o->ptr)->zsl->length);
}

A
antirez 已提交
5968 5969 5970
/* Reply with the score of element argv[2] in the sorted set at argv[1].
 * A missing key or a missing element yields shared.nullbulk. */
static void zscoreCommand(redisClient *c) {
    robj *o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk);
    zset *zs;
    dictEntry *de;

    if (o == NULL) return;
    if (checkType(c,o,REDIS_ZSET)) return;

    zs = o->ptr;
    de = dictFind(zs->dict,c->argv[2]);
    if (de == NULL) {
        addReply(c,shared.nullbulk);
        return;
    }
    /* The hash table maps every element to a pointer to its score. */
    addReplyDouble(c,*(double*)dictGetEntryVal(de));
}

P
Pieter Noordhuis 已提交
5987
/* Reply with the 0-based rank of element argv[2] in the sorted set stored
 * at argv[1]; with a non-zero 'reverse' the rank is counted from the end
 * of the set. A missing key or element yields shared.nullbulk. */
static void zrankGenericCommand(redisClient *c, int reverse) {
    robj *member = c->argv[2];
    robj *zobj;
    zset *zs;
    zskiplist *zsl;
    dictEntry *entry;
    double *score;
    unsigned long rank;

    zobj = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk);
    if (zobj == NULL || checkType(c,zobj,REDIS_ZSET)) return;

    zs = zobj->ptr;
    zsl = zs->zsl;
    entry = dictFind(zs->dict,member);
    if (entry == NULL) {
        addReply(c,shared.nullbulk);
        return;
    }

    /* The skiplist lookup needs the score, which is read from the hash
     * table entry. zslGetRank() returns a 1-based rank, 0 on a miss. */
    score = dictGetEntryVal(entry);
    rank = zslGetRank(zsl, *score, member);
    if (rank == 0) {
        addReply(c,shared.nullbulk);
        return;
    }

    if (reverse)
        addReplyLong(c, zsl->length - rank);
    else
        addReplyLong(c, rank-1);
}

P
Pieter Noordhuis 已提交
6019 6020 6021 6022 6023 6024 6025 6026
/* Command handler: runs zrankGenericCommand() with reverse == 0. */
static void zrankCommand(redisClient *c) {
    zrankGenericCommand(c, 0);
}

/* Command handler: runs zrankGenericCommand() with reverse == 1, so the
 * rank is counted from the end of the sorted set. */
static void zrevrankCommand(redisClient *c) {
    zrankGenericCommand(c, 1);
}

6027
/* =================================== Hashes =============================== */
6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041
/* Set field argv[2] of the hash stored at argv[1] to the value argv[3],
 * creating the hash when the key does not exist. A key holding another
 * type gets the wrong-type error reply. The reply is the integer
 * (update == 0), where 'update' is set by zipmapSet() in the zipmap case
 * and is 1 in the hash table case when dictReplace() returned 0. */
static void hsetCommand(redisClient *c) {
    robj *o;
    int update = 0;
    int j;

    o = lookupKeyWrite(c->db,c->argv[1]);
    if (o != NULL) {
        if (o->type != REDIS_HASH) {
            addReply(c,shared.wrongtypeerr);
            return;
        }
    } else {
        o = createHashObject();
        dictAdd(c->db->dict,c->argv[1],o);
        incrRefCount(c->argv[1]);
    }

    /* We want to convert the zipmap into an hash table right now if the
     * field or the value to be added is too big. The encoding is tested
     * before the length is fetched: integer encoded objects are small, and
     * currently stringObjectLen() performs a slow conversion, so it is not
     * worth calling. */
    if (o->encoding == REDIS_ENCODING_ZIPMAP) {
        for (j = 2; j <= 3; j++) {
            if (c->argv[j]->encoding == REDIS_ENCODING_RAW &&
                sdslen(c->argv[j]->ptr) > server.hash_max_zipmap_value)
            {
                convertToRealHash(o);
                break;
            }
        }
    }

    if (o->encoding == REDIS_ENCODING_ZIPMAP) {
        robj *valobj = getDecodedObject(c->argv[3]);
        unsigned char *zm;

        zm = zipmapSet(o->ptr,c->argv[2]->ptr,sdslen(c->argv[2]->ptr),
            valobj->ptr,sdslen(valobj->ptr),&update);
        decrRefCount(valobj);
        o->ptr = zm;

        /* And here there is the second check for hash conversion. */
        if (zipmapLen(zm) > server.hash_max_zipmap_entries)
            convertToRealHash(o);
    } else {
        c->argv[2] = tryObjectEncoding(c->argv[2]);
        /* note that c->argv[3] is already encoded, as the latest arg
         * of a bulk command is always integer encoded if possible. */
        update = !dictReplace(o->ptr,c->argv[2],c->argv[3]);
        /* Only when dictReplace() returned non-zero does the field object
         * get an extra reference. */
        if (!update) incrRefCount(c->argv[2]);
        incrRefCount(c->argv[3]);
    }
    server.dirty++;
    addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",update == 0));
}

P
Pieter Noordhuis 已提交
6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146
/* HMSET key field value [field value ...]
 *
 * Set every field/value pair found on the command line in the hash stored
 * at key, creating the hash when the key is missing. Replies with
 * shared.ok on success, with an error when the arguments do not form
 * complete pairs, and with shared.wrongtypeerr when the key holds a value
 * that is not a hash. */
static void hmsetCommand(redisClient *c) {
    int i;
    robj *o, *key, *val;

    /* argv[0] is the command name and argv[1] the key, so complete
     * field/value pairs always give an even argc. */
    if ((c->argc % 2) == 1) {
        addReplySds(c,sdsnew("-ERR wrong number of arguments for HMSET\r\n"));
        return;
    }

    if ((o = lookupKeyWrite(c->db,c->argv[1])) == NULL) {
        o = createHashObject();
        dictAdd(c->db->dict,c->argv[1],o);
        incrRefCount(c->argv[1]);
    } else {
        if (o->type != REDIS_HASH) {
            addReply(c,shared.wrongtypeerr);
            return;
        }
    }

    /* We want to convert the zipmap into an hash table right now if any of
     * the entries to be added is too big. Only raw encoded arguments are
     * measured. */
    if (o->encoding == REDIS_ENCODING_ZIPMAP) {
        for (i = 2; i < c->argc; i+=2) {
            if ((c->argv[i]->encoding == REDIS_ENCODING_RAW &&
                  sdslen(c->argv[i]->ptr) > server.hash_max_zipmap_value) ||
                (c->argv[i+1]->encoding == REDIS_ENCODING_RAW &&
                  sdslen(c->argv[i+1]->ptr) > server.hash_max_zipmap_value)) {
                convertToRealHash(o);
                break;
            }
        }
    }

    if (o->encoding == REDIS_ENCODING_ZIPMAP) {
        unsigned char *zm = o->ptr;

        for (i = 2; i < c->argc; i+=2) {
            /* The zipmap stores plain bytes, so work on decoded copies. */
            key = getDecodedObject(c->argv[i]);
            val = getDecodedObject(c->argv[i+1]);
            zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
                              val->ptr,sdslen(val->ptr),NULL);
            decrRefCount(key);
            decrRefCount(val);
            /* zipmapSet() returns the zipmap pointer to use from now on. */
            o->ptr = zm;
        }

        /* And here there is the second check for hash conversion. */
        if (zipmapLen(zm) > server.hash_max_zipmap_entries)
            convertToRealHash(o);
    } else {
        for (i = 2; i < c->argc; i+=2) {
            /* tryObjectEncoding() returns the object to use, which is not
             * guaranteed to be the one passed in. Store the result back in
             * argv, exactly as HSET does, so that the client argument
             * vector never keeps a pointer the encoder may have released
             * (NOTE(review): confirm against tryObjectEncoding()). */
            c->argv[i] = tryObjectEncoding(c->argv[i]);
            c->argv[i+1] = tryObjectEncoding(c->argv[i+1]);
            key = c->argv[i];
            val = c->argv[i+1];
            /* The field object is retained only when dictReplace() reports
             * a non-zero result; the value object is retained always. */
            if (dictReplace(o->ptr,key,val)) {
                incrRefCount(key);
            }
            incrRefCount(val);
        }
    }

    /* HMSET changes the dataset: account for it in the dirty counter like
     * the other hash write commands in this file do. */
    server.dirty++;
    addReply(c, shared.ok);
}

6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161
/* HINCRBY key field increment
 *
 * Add 'increment' to the integer stored at 'field' in the hash at 'key'
 * and reply with the resulting value. A missing key is created as a new
 * hash and a missing field starts from 0. Existing field values are parsed
 * with strtoll() (or read directly when integer encoded). Replies with
 * shared.wrongtypeerr when the key holds a value that is not a hash. */
static void hincrbyCommand(redisClient *c) {
    long long value = 0, incr = 0;
    robj *o = lookupKeyWrite(c->db,c->argv[1]);

    if (o != NULL && o->type != REDIS_HASH) {
        addReply(c,shared.wrongtypeerr);
        return;
    }

    /* Validate the increment before touching the keyspace: creating the
     * hash first would leave a stray empty hash behind whenever the
     * increment cannot be parsed. getLongLongFromObject() is handed the
     * client, presumably to send the error reply itself. */
    if (getLongLongFromObject(c, c->argv[3], &incr) != REDIS_OK) return;

    if (o == NULL) {
        o = createHashObject();
        dictAdd(c->db->dict,c->argv[1],o);
        incrRefCount(c->argv[1]);
    }

    if (o->encoding == REDIS_ENCODING_ZIPMAP) {
        unsigned char *zm = o->ptr;
        unsigned char *zval;
        unsigned int zvlen;

        /* Find value if already present in hash */
        if (zipmapGet(zm,c->argv[2]->ptr,sdslen(c->argv[2]->ptr),
            &zval,&zvlen)) {
            /* strtoll needs the char* to have a trailing \0, but
             * the zipmap doesn't include them. */
            sds szval = sdsnewlen(zval, zvlen);
            value = strtoll(szval,NULL,10);
            sdsfree(szval);
        }

        value += incr;
        sds svalue = sdscatprintf(sdsempty(),"%lld",value);
        zm = zipmapSet(zm,c->argv[2]->ptr,sdslen(c->argv[2]->ptr),
            (unsigned char*)svalue,sdslen(svalue),NULL);
        sdsfree(svalue);
        /* zipmapSet() returns the zipmap pointer to use from now on. */
        o->ptr = zm;

        /* Check if the zipmap needs to be converted. */
        if (zipmapLen(zm) > server.hash_max_zipmap_entries)
            convertToRealHash(o);
    } else {
        robj *hval;
        dictEntry *de;

        /* Find value if already present in hash */
        de = dictFind(o->ptr,c->argv[2]);
        if (de != NULL) {
            hval = dictGetEntryVal(de);
            if (hval->encoding == REDIS_ENCODING_RAW)
                value = strtoll(hval->ptr,NULL,10);
            else if (hval->encoding == REDIS_ENCODING_INT)
                value = (long)hval->ptr;
            else
                redisAssert(1 != 1);
        }

        value += incr;
        hval = createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",value));
        hval = tryObjectEncoding(hval);
        /* The field object is retained only when dictReplace() reports a
         * non-zero result; the freshly created value is handed to the
         * dictionary as is. */
        if (dictReplace(o->ptr,c->argv[2],hval)) {
            incrRefCount(c->argv[2]);
        }
    }

    server.dirty++;
    addReplyLongLong(c, value);
}

6217
/* HGET key field
 *
 * Reply with the value stored at 'field' as a bulk reply, or with
 * shared.nullbulk when the field is not found. lookupKeyReadOrReply() is
 * given shared.nullbulk as the reply for a missing key. */
static void hgetCommand(redisClient *c) {
    robj *hash = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk);

    if (hash == NULL) return;
    if (checkType(c,hash,REDIS_HASH)) return;

    if (hash->encoding != REDIS_ENCODING_ZIPMAP) {
        /* Hash table encoding: the stored value is already an object. */
        struct dictEntry *entry = dictFind(hash->ptr,c->argv[2]);

        if (entry != NULL) {
            robj *stored = dictGetEntryVal(entry);

            addReplyBulk(c,stored);
        } else {
            addReply(c,shared.nullbulk);
        }
        return;
    }

    /* Zipmap encoding: look the field up by its decoded bytes and build
     * the bulk reply by hand from the raw value. */
    {
        unsigned char *zm = hash->ptr;
        unsigned char *data;
        unsigned int datalen;
        robj *field = getDecodedObject(c->argv[2]);

        if (zipmapGet(zm,field->ptr,sdslen(field->ptr),&data,&datalen)) {
            addReplySds(c,sdscatprintf(sdsempty(),"$%u\r\n", datalen));
            addReplySds(c,sdsnewlen(data,datalen));
            addReply(c,shared.crlf);
        } else {
            addReply(c,shared.nullbulk);
        }
        decrRefCount(field);
    }
}

P
Pieter Noordhuis 已提交
6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303
/* HMGET key field [field ...]
 *
 * Reply with a multi bulk of argc-2 elements, one per requested field in
 * command line order. Fields that are not found, and every field when the
 * key is missing, are reported as shared.nullbulk. Replies with
 * shared.wrongtypeerr when the key holds a value that is not a hash. */
static void hmgetCommand(redisClient *c) {
    int j;
    int nfields = c->argc-2;
    robj *hash = lookupKeyRead(c->db, c->argv[1]);

    if (hash != NULL && hash->type != REDIS_HASH) {
        addReply(c,shared.wrongtypeerr);
        return;
    }

    /* The header is the same whether or not the key exists. */
    addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",nfields));

    if (hash == NULL) {
        for (j = 0; j < nfields; j++)
            addReply(c,shared.nullbulk);
        return;
    }

    if (hash->encoding != REDIS_ENCODING_ZIPMAP) {
        for (j = 2; j < c->argc; j++) {
            dictEntry *entry = dictFind(hash->ptr,c->argv[j]);

            if (entry == NULL)
                addReply(c,shared.nullbulk);
            else
                addReplyBulk(c,(robj*)dictGetEntryVal(entry));
        }
        return;
    }

    /* Zipmap encoding: fields are matched by their decoded bytes and the
     * bulk replies are assembled from the raw values. */
    {
        unsigned char *zm = hash->ptr;

        for (j = 2; j < c->argc; j++) {
            unsigned char *data;
            unsigned int datalen;
            robj *field = getDecodedObject(c->argv[j]);

            if (zipmapGet(zm,field->ptr,sdslen(field->ptr),&data,&datalen)) {
                addReplySds(c,sdscatprintf(sdsempty(),"$%u\r\n", datalen));
                addReplySds(c,sdsnewlen(data,datalen));
                addReply(c,shared.crlf);
            } else {
                addReply(c,shared.nullbulk);
            }
            decrRefCount(field);
        }
    }
}

6304
/* HDEL key field
 *
 * Remove 'field' from the hash at 'key'. Replies with shared.cone when the
 * field was removed and shared.czero otherwise. When the hash becomes
 * empty the key itself is deleted from the database. */
static void hdelCommand(redisClient *c) {
    int removed = 0;
    robj *hash = lookupKeyWriteOrReply(c,c->argv[1],shared.czero);

    if (hash == NULL) return;
    if (checkType(c,hash,REDIS_HASH)) return;

    if (hash->encoding != REDIS_ENCODING_ZIPMAP) {
        removed = (dictDelete((dict*)hash->ptr,c->argv[2]) == DICT_OK);
        if (htNeedsResize(hash->ptr))
            dictResize(hash->ptr);
        if (dictSize((dict*)hash->ptr) == 0)
            deleteKey(c->db,c->argv[1]);
    } else {
        /* The zipmap is searched by the decoded bytes of the field. */
        robj *field = getDecodedObject(c->argv[2]);

        /* zipmapDel() returns the zipmap pointer to use from now on and
         * reports through 'removed' whether the field was there. */
        hash->ptr = zipmapDel((unsigned char*) hash->ptr,
                              (unsigned char*) field->ptr,
                              sdslen(field->ptr), &removed);
        decrRefCount(field);
        if (zipmapLen((unsigned char*) hash->ptr) == 0)
            deleteKey(c->db,c->argv[1]);
    }

    if (removed) {
        server.dirty++;
        addReply(c,shared.cone);
    } else {
        addReply(c,shared.czero);
    }
}

6329 6330 6331 6332
/* HLEN key
 *
 * Reply with the number of fields in the hash at 'key'.
 * lookupKeyReadOrReply() is given shared.czero as the reply for a missing
 * key. */
static void hlenCommand(redisClient *c) {
    unsigned long nfields;
    robj *hash = lookupKeyReadOrReply(c,c->argv[1],shared.czero);

    if (hash == NULL) return;
    if (checkType(c,hash,REDIS_HASH)) return;

    if (hash->encoding == REDIS_ENCODING_ZIPMAP)
        nfields = zipmapLen((unsigned char*)hash->ptr);
    else
        nfields = dictSize((dict*)hash->ptr);
    addReplyUlong(c,nfields);
}

A
antirez 已提交
6341 6342 6343 6344 6345 6346
#define REDIS_GETALL_KEYS 1
#define REDIS_GETALL_VALS 2
/* Shared implementation of HKEYS, HVALS and HGETALL.
 *
 * 'flags' is a combination of REDIS_GETALL_KEYS and REDIS_GETALL_VALS and
 * selects what is emitted for every hash entry: the field, the value, or
 * the field followed by the value. The reply is a multi bulk whose element
 * count is only known once the whole hash has been walked. */
static void genericHgetallCommand(redisClient *c, int flags) {
    robj *hash, *header;
    unsigned long emitted = 0;
    int wantkeys = flags & REDIS_GETALL_KEYS;
    int wantvals = flags & REDIS_GETALL_VALS;

    hash = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk);
    if (hash == NULL) return;
    if (checkType(c,hash,REDIS_HASH)) return;

    /* Queue the multi bulk header now with no payload and fill it in at
     * the end of the function. The object is written to after our own
     * reference is dropped, so the reply queue presumably keeps one. */
    header = createObject(REDIS_STRING,NULL);
    addReply(c,header);
    decrRefCount(header);

    if (hash->encoding != REDIS_ENCODING_ZIPMAP) {
        dictIterator *iter = dictGetIterator(hash->ptr);
        dictEntry *entry;

        while ((entry = dictNext(iter)) != NULL) {
            robj *fieldobj = dictGetEntryKey(entry);
            robj *valobj = dictGetEntryVal(entry);

            if (wantkeys) {
                addReplyBulk(c,fieldobj);
                emitted++;
            }
            if (wantvals) {
                addReplyBulk(c,valobj);
                emitted++;
            }
        }
        dictReleaseIterator(iter);
    } else {
        unsigned char *cursor = zipmapRewind(hash->ptr);
        unsigned char *fieldptr, *valptr;
        unsigned int fieldlen, vallen;

        for (;;) {
            robj *tmp;

            cursor = zipmapNext(cursor,&fieldptr,&fieldlen,&valptr,&vallen);
            if (cursor == NULL) break;

            /* Zipmap entries are raw bytes: wrap each one in a temporary
             * string object just for the time needed to queue it. */
            if (wantkeys) {
                tmp = createStringObject((char*)fieldptr,fieldlen);
                addReplyBulk(c,tmp);
                decrRefCount(tmp);
                emitted++;
            }
            if (wantvals) {
                tmp = createStringObject((char*)valptr,vallen);
                addReplyBulk(c,tmp);
                decrRefCount(tmp);
                emitted++;
            }
        }
    }

    header->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",emitted);
}

/* HKEYS key: reply with the field names only, by running the shared
 * generic implementation with the REDIS_GETALL_KEYS flag. */
static void hkeysCommand(redisClient *c) {
    genericHgetallCommand(c,REDIS_GETALL_KEYS);
}

/* HVALS key: reply with the values only, by running the shared generic
 * implementation with the REDIS_GETALL_VALS flag. */
static void hvalsCommand(redisClient *c) {
    genericHgetallCommand(c,REDIS_GETALL_VALS);
}

/* HGETALL key: reply with every field followed by its value, by running
 * the shared generic implementation with both flags set. */
static void hgetallCommand(redisClient *c) {
    genericHgetallCommand(c,REDIS_GETALL_KEYS|REDIS_GETALL_VALS);
}

A
antirez 已提交
6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428
/* HEXISTS key field
 *
 * Reply with shared.cone when 'field' is present in the hash at 'key' and
 * with shared.czero otherwise. lookupKeyReadOrReply() is given
 * shared.czero as the reply for a missing key. */
static void hexistsCommand(redisClient *c) {
    int found;
    robj *hash = lookupKeyReadOrReply(c,c->argv[1],shared.czero);

    if (hash == NULL) return;
    if (checkType(c,hash,REDIS_HASH)) return;

    if (hash->encoding != REDIS_ENCODING_ZIPMAP) {
        found = (dictFind(hash->ptr,c->argv[2]) != NULL);
    } else {
        /* The zipmap is searched by the decoded bytes of the field. */
        unsigned char *zm = hash->ptr;
        robj *field = getDecodedObject(c->argv[2]);

        found = zipmapExists(zm,field->ptr,sdslen(field->ptr));
        decrRefCount(field);
    }

    if (found)
        addReply(c,shared.cone);
    else
        addReply(c,shared.czero);
}

6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440
/* Turn a hash object that is not hash table encoded into one backed by a
 * real hash table: every zipmap entry is copied into a new dictionary as a
 * pair of string objects (encoded when possible), then the object is
 * switched to REDIS_ENCODING_HT and the old zipmap is released. */
static void convertToRealHash(robj *o) {
    unsigned char *oldzm = o->ptr;
    unsigned char *cursor;
    unsigned char *fieldptr, *valptr;
    unsigned int fieldlen, vallen;
    dict *ht = dictCreate(&hashDictType,NULL);

    assert(o->type == REDIS_HASH && o->encoding != REDIS_ENCODING_HT);

    cursor = zipmapRewind(oldzm);
    for (;;) {
        robj *fieldobj, *valobj;

        cursor = zipmapNext(cursor,&fieldptr,&fieldlen,&valptr,&vallen);
        if (cursor == NULL) break;

        fieldobj = createStringObject((char*)fieldptr,fieldlen);
        valobj = createStringObject((char*)valptr,vallen);
        fieldobj = tryObjectEncoding(fieldobj);
        valobj = tryObjectEncoding(valobj);
        dictAdd(ht,fieldobj,valobj);
    }

    o->encoding = REDIS_ENCODING_HT;
    o->ptr = ht;
    zfree(oldzm);
}

6450 6451
/* ========================= Non type-specific commands  ==================== */

A
antirez 已提交
6452
/* FLUSHDB: empty the main and the expires dictionaries of the database
 * selected by the client. The number of keys held before the flush is
 * added to the dirty counter. */
static void flushdbCommand(redisClient *c) {
    redisDb *db = c->db;

    server.dirty += dictSize(db->dict);
    dictEmpty(db->dict);
    dictEmpty(db->expires);
    addReply(c,shared.ok);
}

/* FLUSHALL: empty the dataset through emptyDb(), kill a background save
 * child if one is running (removing its temp file), and write a new dump
 * with rdbSave(). */
static void flushallCommand(redisClient *c) {
    long long removed = emptyDb();

    server.dirty += removed;
    addReply(c,shared.ok);

    /* A background save in progress would dump the old dataset: stop it
     * and remove what it wrote so far. */
    if (server.bgsavechildpid != -1) {
        kill(server.bgsavechildpid,SIGKILL);
        rdbRemoveTempFile(server.bgsavechildpid);
    }

    rdbSave(server.dbfilename);
    /* NOTE(review): rdbSave() presumably resets the dirty counter; the
     * increment below then keeps this command counted as a change. */
    server.dirty++;
}

6470
/* Allocate a sort operation of the given type bound to 'pattern'. The
 * pattern pointer is stored as is; the returned structure comes from
 * zmalloc(). */
static redisSortOperation *createSortOperation(int type, robj *pattern) {
    redisSortOperation *op = zmalloc(sizeof(redisSortOperation));

    op->pattern = pattern;
    op->type = type;
    return op;
}

/* Return the value associated to the key with a name obtained
 * substituting the first occurence of '*' in 'pattern' with 'subst' */
6479
static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
A
antirez 已提交
6480 6481 6482 6483 6484 6485
    char *p;
    sds spat, ssub;
    robj keyobj;
    int prefixlen, sublen, postfixlen;
    /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
    struct {
6486 6487
        long len;
        long free;
A
antirez 已提交
6488 6489 6490
        char buf[REDIS_SORTKEY_MAX+1];
    } keyname;

A
antirez 已提交
6491 6492 6493 6494 6495 6496 6497 6498
    /* If the pattern is "#" return the substitution object itself in order
     * to implement the "SORT ... GET #" feature. */
    spat = pattern->ptr;
    if (spat[0] == '#' && spat[1] == '\0') {
        return subst;
    }

    /* The substitution object may be specially encoded. If so we create
6499 6500 6501
     * a decoded object on the fly. Otherwise getDecodedObject will just
     * increment the ref count, that we'll decrement later. */
    subst = getDecodedObject(subst);
6502

A
antirez 已提交
6503 6504 6505
    ssub = subst->ptr;
    if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
    p = strchr(spat,'*');
6506 6507 6508 6509
    if (!p) {
        decrRefCount(subst);
        return NULL;
    }
A
antirez 已提交
6510 6511 6512 6513 6514 6515 6516 6517 6518 6519

    prefixlen = p-spat;
    sublen = sdslen(ssub);
    postfixlen = sdslen(spat)-(prefixlen+1);
    memcpy(keyname.buf,spat,prefixlen);
    memcpy(keyname.buf+prefixlen,ssub,sublen);
    memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
    keyname.buf[prefixlen+sublen+postfixlen] = '\0';
    keyname.len = prefixlen+sublen+postfixlen;

6520
    initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2))
6521 6522
    decrRefCount(subst);

A
antirez 已提交
6523
    /* printf("lookup '%s' => %p\n", keyname.buf,de); */
A
antirez 已提交
6524
    return lookupKeyRead(db,&keyobj);
A
antirez 已提交
6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559
}

/* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
 * the additional parameter is not standard but a BSD-specific we have to
 * pass sorting parameters via the global 'server' structure */
static int sortCompare(const void *s1, const void *s2) {
    const redisSortObject *so1 = s1, *so2 = s2;
    int cmp;

    if (!server.sort_alpha) {
        /* Numeric sorting. Here it's trivial as we precomputed scores */
        if (so1->u.score > so2->u.score) {
            cmp = 1;
        } else if (so1->u.score < so2->u.score) {
            cmp = -1;
        } else {
            cmp = 0;
        }
    } else {
        /* Alphanumeric sorting */
        if (server.sort_bypattern) {
            if (!so1->u.cmpobj || !so2->u.cmpobj) {
                /* At least one compare object is NULL */
                if (so1->u.cmpobj == so2->u.cmpobj)
                    cmp = 0;
                else if (so1->u.cmpobj == NULL)
                    cmp = -1;
                else
                    cmp = 1;
            } else {
                /* We have both the objects, use strcoll */
                cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
            }
        } else {
            /* Compare elements directly */
6560 6561 6562 6563 6564 6565 6566
            robj *dec1, *dec2;

            dec1 = getDecodedObject(so1->obj);
            dec2 = getDecodedObject(so2->obj);
            cmp = strcoll(dec1->ptr,dec2->ptr);
            decrRefCount(dec1);
            decrRefCount(dec2);
A
antirez 已提交
6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580
        }
    }
    return server.sort_desc ? -cmp : cmp;
}

/* The SORT command is the most complex command in Redis. Warning: this code
 * is optimized for speed and a bit less for readability.
 *
 * Options parsed here: ASC, DESC, ALPHA, LIMIT <start> <count>,
 * STORE <key>, BY <pattern> and GET <pattern> (repeatable). Without STORE
 * the result is sent to the client as a multi bulk reply; with STORE it is
 * saved as a list at the given key and the number of stored elements is
 * returned as an integer reply. */
static void sortCommand(redisClient *c) {
    list *ops;                   /* GET operations in command line order */
    redisSortObject *vec;        /* Vector of elements to sort */
    robj *sortval, *sortby = NULL, *storekey = NULL;
    int desc = 0, alpha = 0, dontsort = 0;
    int limit_start = 0, limit_count = -1;
    int start, end, span;
    int outputlen = 0;
    int getop = 0;               /* GET operation counter */
    int j, vectorlen;

    /* Lookup the key to sort. It must be of the right types */
    sortval = lookupKeyRead(c->db,c->argv[1]);
    if (sortval == NULL) {
        addReply(c,shared.emptymultibulk);
        return;
    }
    if (sortval->type != REDIS_SET &&
        sortval->type != REDIS_LIST &&
        sortval->type != REDIS_ZSET)
    {
        addReply(c,shared.wrongtypeerr);
        return;
    }

    /* Create the list of operations to perform for every sorted element.
     * Only GET operations are ever added by the parser below. */
    ops = listCreate();
    listSetFreeMethod(ops,zfree);

    /* Now we need to protect sortval incrementing its count, in the future
     * SORT may have options able to overwrite/delete keys during the sorting
     * and the sorted key itself may get destroyed */
    incrRefCount(sortval);

    /* The SORT command has an SQL-alike syntax, parse it */
    for (j = 2; j < c->argc; j++) {
        char *opt = c->argv[j]->ptr;
        int leftargs = c->argc-j-1;

        if (!strcasecmp(opt,"asc")) {
            desc = 0;
        } else if (!strcasecmp(opt,"desc")) {
            desc = 1;
        } else if (!strcasecmp(opt,"alpha")) {
            alpha = 1;
        } else if (!strcasecmp(opt,"limit") && leftargs >= 2) {
            limit_start = atoi(c->argv[j+1]->ptr);
            limit_count = atoi(c->argv[j+2]->ptr);
            j += 2;
        } else if (!strcasecmp(opt,"store") && leftargs >= 1) {
            storekey = c->argv[j+1];
            j++;
        } else if (!strcasecmp(opt,"by") && leftargs >= 1) {
            sortby = c->argv[j+1];
            /* If the BY pattern does not contain '*', i.e. it is constant,
             * we don't need to sort nor to lookup the weight keys. */
            if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
            j++;
        } else if (!strcasecmp(opt,"get") && leftargs >= 1) {
            listAddNodeTail(ops,
                createSortOperation(REDIS_SORT_GET,c->argv[j+1]));
            getop++;
            j++;
        } else {
            /* Unknown option, or option missing its arguments. */
            decrRefCount(sortval);
            listRelease(ops);
            addReply(c,shared.syntaxerr);
            return;
        }
    }

    /* Load the sorting vector with all the objects to sort */
    if (sortval->type == REDIS_LIST) {
        vectorlen = listLength((list*)sortval->ptr);
    } else if (sortval->type == REDIS_SET) {
        vectorlen = dictSize((dict*)sortval->ptr);
    } else if (sortval->type == REDIS_ZSET) {
        vectorlen = dictSize(((zset*)sortval->ptr)->dict);
    } else {
        vectorlen = 0;
        redisAssert(0); /* Types were already checked above */
    }
    vec = zmalloc(sizeof(redisSortObject)*vectorlen);
    j = 0;

    if (sortval->type == REDIS_LIST) {
        list *src = sortval->ptr;
        listNode *node;
        listIter iter;

        listRewind(src,&iter);
        while ((node = listNext(&iter))) {
            robj *ele = node->value;

            vec[j].obj = ele;
            vec[j].u.score = 0;
            vec[j].u.cmpobj = NULL;
            j++;
        }
    } else {
        /* Sets and sorted sets are both walked via a dictionary whose
         * keys are the elements. */
        dict *src;
        dictIterator *iter;
        dictEntry *entry;

        if (sortval->type == REDIS_SET) {
            src = sortval->ptr;
        } else {
            zset *zs = sortval->ptr;

            src = zs->dict;
        }

        iter = dictGetIterator(src);
        while ((entry = dictNext(iter)) != NULL) {
            vec[j].obj = dictGetEntryKey(entry);
            vec[j].u.score = 0;
            vec[j].u.cmpobj = NULL;
            j++;
        }
        dictReleaseIterator(iter);
    }
    redisAssert(j == vectorlen);

    /* Now it's time to load the right scores in the sorting vector */
    if (dontsort == 0) {
        for (j = 0; j < vectorlen; j++) {
            robj *weight; /* Object the numeric score is read from */

            if (sortby) {
                weight = lookupKeyByPattern(c->db,sortby,vec[j].obj);
                if (!weight || weight->type != REDIS_STRING) continue;
                if (alpha) {
                    /* Released in the cleanup stage at the end. */
                    vec[j].u.cmpobj = getDecodedObject(weight);
                    continue;
                }
            } else {
                /* Alpha sorting without BY compares the elements
                 * directly: there is nothing to precompute. */
                if (alpha) continue;
                weight = vec[j].obj;
            }

            /* Numeric score. Don't need to decode the object if it's
             * integer-encoded (the only encoding supported so far).
             * We can just cast it */
            if (weight->encoding == REDIS_ENCODING_RAW) {
                vec[j].u.score = strtod(weight->ptr,NULL);
            } else if (weight->encoding == REDIS_ENCODING_INT) {
                vec[j].u.score = (long)weight->ptr;
            } else {
                redisAssert(1 != 1);
            }
        }
    }

    /* We are ready to sort the vector... perform a bit of sanity check
     * on the LIMIT option too. We'll use a partial version of quicksort. */
    start = (limit_start < 0) ? 0 : limit_start;
    end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
    if (start >= vectorlen) {
        /* Range past the end: make it empty (end < start). */
        start = vectorlen-1;
        end = vectorlen-2;
    }
    if (end >= vectorlen) end = vectorlen-1;

    if (dontsort == 0) {
        server.sort_desc = desc;
        server.sort_alpha = alpha;
        server.sort_bypattern = sortby ? 1 : 0;
        if (sortby && (start != 0 || end != vectorlen-1))
            pqsort(vec,vectorlen,sizeof(redisSortObject),sortCompare,
                   start,end);
        else
            qsort(vec,vectorlen,sizeof(redisSortObject),sortCompare);
    }

    /* Send command output to the output buffer, performing the specified
     * GET operations if any. With GET every selected element produces one
     * output item per GET pattern instead of the element itself. */
    span = end-start+1;
    outputlen = getop ? getop*span : span;

    if (storekey == NULL) {
        /* STORE option not specified, send the sorting result to client */
        addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
        for (j = start; j <= end; j++) {
            listNode *node;
            listIter iter;

            if (!getop) addReplyBulk(c,vec[j].obj);

            listRewind(ops,&iter);
            while ((node = listNext(&iter))) {
                redisSortOperation *sop = node->value;
                robj *val = lookupKeyByPattern(c->db,sop->pattern,
                                               vec[j].obj);

                if (sop->type != REDIS_SORT_GET) {
                    redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
                } else if (val && val->type == REDIS_STRING) {
                    addReplyBulk(c,val);
                } else {
                    addReply(c,shared.nullbulk);
                }
            }
        }
    } else {
        /* STORE option specified, set the sorting result as a List object */
        robj *dstobj = createListObject();
        list *dstlist = (list*) dstobj->ptr;

        for (j = start; j <= end; j++) {
            listNode *node;
            listIter iter;

            if (!getop) {
                listAddNodeTail(dstlist,vec[j].obj);
                incrRefCount(vec[j].obj);
            }

            listRewind(ops,&iter);
            while ((node = listNext(&iter))) {
                redisSortOperation *sop = node->value;
                robj *val = lookupKeyByPattern(c->db,sop->pattern,
                                               vec[j].obj);

                if (sop->type != REDIS_SORT_GET) {
                    redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
                } else if (val && val->type == REDIS_STRING) {
                    listAddNodeTail(dstlist,val);
                    incrRefCount(val);
                } else {
                    /* Missing values are stored as empty strings. */
                    listAddNodeTail(dstlist,createStringObject("",0));
                }
            }
        }

        if (dictReplace(c->db->dict,storekey,dstobj)) {
            incrRefCount(storekey);
        }
        /* Note: we add 1 because the DB is dirty anyway since even if the
         * SORT result is empty a new key is set and maybe the old content
         * replaced. */
        server.dirty += 1+outputlen;
        addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
    }

    /* Cleanup */
    decrRefCount(sortval);
    listRelease(ops);
    for (j = 0; j < vectorlen; j++) {
        if (sortby && alpha && vec[j].u.cmpobj)
            decrRefCount(vec[j].u.cmpobj);
    }
    zfree(vec);
}

6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 2G, 100M, 4K, and so forth.
 *
 * 's' is the destination buffer and must be large enough for the result
 * (at most 21 characters plus the terminator are written). 'n' is the
 * amount of bytes. Units go from B up to P (1024^5 bytes) with two decimal
 * digits; anything larger is printed as a plain number of bytes, so 's'
 * is always written. */
static void bytesToHuman(char *s, unsigned long long n) {
    double d;

    if (n < 1024ULL) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < (1024ULL*1024)) {
        d = (double)n/(1024);
        sprintf(s,"%.2fK",d);
    } else if (n < (1024ULL*1024*1024)) {
        d = (double)n/(1024*1024);
        sprintf(s,"%.2fM",d);
    } else if (n < (1024ULL*1024*1024*1024)) {
        d = (double)n/(1024ULL*1024*1024);
        sprintf(s,"%.2fG",d);
    } else if (n < (1024ULL*1024*1024*1024*1024)) {
        d = (double)n/(1024ULL*1024*1024*1024);
        sprintf(s,"%.2fT",d);
    } else if (n < (1024ULL*1024*1024*1024*1024*1024)) {
        d = (double)n/(1024ULL*1024*1024*1024*1024);
        sprintf(s,"%.2fP",d);
    } else {
        /* Too big for the units above: never leave 's' unwritten. */
        sprintf(s,"%lluB",n);
    }
}

6848 6849 6850 6851
/* Create the string returned by the INFO command. This is decoupled
 * by the INFO command itself as we need to report the same information
 * on memory corruption problems. */
static sds genRedisInfoString(void) {
A
antirez 已提交
6852 6853
    sds info;
    time_t uptime = time(NULL)-server.stat_starttime;
6854
    int j;
6855
    char hmem[64];
6856

6857
    bytesToHuman(hmem,zmalloc_used_memory());
A
antirez 已提交
6858 6859
    info = sdscatprintf(sdsempty(),
        "redis_version:%s\r\n"
6860
        "arch_bits:%s\r\n"
6861
        "multiplexing_api:%s\r\n"
A
antirez 已提交
6862
        "process_id:%ld\r\n"
6863 6864
        "uptime_in_seconds:%ld\r\n"
        "uptime_in_days:%ld\r\n"
A
antirez 已提交
6865 6866
        "connected_clients:%d\r\n"
        "connected_slaves:%d\r\n"
A
antirez 已提交
6867
        "blocked_clients:%d\r\n"
B
Bob Potter 已提交
6868
        "used_memory:%zu\r\n"
6869
        "used_memory_human:%s\r\n"
A
antirez 已提交
6870
        "changes_since_last_save:%lld\r\n"
6871
        "bgsave_in_progress:%d\r\n"
6872
        "last_save_time:%ld\r\n"
A
antirez 已提交
6873
        "bgrewriteaof_in_progress:%d\r\n"
A
antirez 已提交
6874 6875
        "total_connections_received:%lld\r\n"
        "total_commands_processed:%lld\r\n"
A
antirez 已提交
6876
        "expired_keys:%lld\r\n"
6877 6878
        "hash_max_zipmap_entries:%ld\r\n"
        "hash_max_zipmap_value:%ld\r\n"
A
antirez 已提交
6879 6880
        "pubsub_channels:%ld\r\n"
        "pubsub_patterns:%u\r\n"
A
antirez 已提交
6881
        "vm_enabled:%d\r\n"
6882
        "role:%s\r\n"
A
antirez 已提交
6883
        ,REDIS_VERSION,
6884
        (sizeof(long) == 8) ? "64" : "32",
6885
        aeGetApiName(),
A
antirez 已提交
6886
        (long) getpid(),
6887 6888
        uptime,
        uptime/(3600*24),
A
antirez 已提交
6889 6890
        listLength(server.clients)-listLength(server.slaves),
        listLength(server.slaves),
A
antirez 已提交
6891
        server.blpop_blocked_clients,
6892
        zmalloc_used_memory(),
6893
        hmem,
A
antirez 已提交
6894
        server.dirty,
6895
        server.bgsavechildpid != -1,
A
antirez 已提交
6896
        server.lastsave,
A
antirez 已提交
6897
        server.bgrewritechildpid != -1,
A
antirez 已提交
6898 6899
        server.stat_numconnections,
        server.stat_numcommands,
A
antirez 已提交
6900
        server.stat_expiredkeys,
6901 6902
        server.hash_max_zipmap_entries,
        server.hash_max_zipmap_value,
A
antirez 已提交
6903 6904
        dictSize(server.pubsub_channels),
        listLength(server.pubsub_patterns),
A
antirez 已提交
6905
        server.vm_enabled != 0,
6906
        server.masterhost == NULL ? "master" : "slave"
A
antirez 已提交
6907
    );
6908 6909 6910 6911 6912 6913 6914 6915 6916 6917
    if (server.masterhost) {
        info = sdscatprintf(info,
            "master_host:%s\r\n"
            "master_port:%d\r\n"
            "master_link_status:%s\r\n"
            "master_last_io_seconds_ago:%d\r\n"
            ,server.masterhost,
            server.masterport,
            (server.replstate == REDIS_REPL_CONNECTED) ?
                "up" : "down",
6918
            server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
6919 6920
        );
    }
A
antirez 已提交
6921
    if (server.vm_enabled) {
A
antirez 已提交
6922
        lockThreadedIO();
A
antirez 已提交
6923 6924 6925 6926 6927 6928 6929 6930
        info = sdscatprintf(info,
            "vm_conf_max_memory:%llu\r\n"
            "vm_conf_page_size:%llu\r\n"
            "vm_conf_pages:%llu\r\n"
            "vm_stats_used_pages:%llu\r\n"
            "vm_stats_swapped_objects:%llu\r\n"
            "vm_stats_swappin_count:%llu\r\n"
            "vm_stats_swappout_count:%llu\r\n"
6931 6932 6933
            "vm_stats_io_newjobs_len:%lu\r\n"
            "vm_stats_io_processing_len:%lu\r\n"
            "vm_stats_io_processed_len:%lu\r\n"
6934
            "vm_stats_io_active_threads:%lu\r\n"
A
antirez 已提交
6935
            "vm_stats_blocked_clients:%lu\r\n"
A
antirez 已提交
6936 6937 6938 6939 6940 6941
            ,(unsigned long long) server.vm_max_memory,
            (unsigned long long) server.vm_page_size,
            (unsigned long long) server.vm_pages,
            (unsigned long long) server.vm_stats_used_pages,
            (unsigned long long) server.vm_stats_swapped_objects,
            (unsigned long long) server.vm_stats_swapins,
6942 6943 6944 6945
            (unsigned long long) server.vm_stats_swapouts,
            (unsigned long) listLength(server.io_newjobs),
            (unsigned long) listLength(server.io_processing),
            (unsigned long) listLength(server.io_processed),
A
antirez 已提交
6946 6947
            (unsigned long) server.io_active_threads,
            (unsigned long) server.vm_blocked_clients
A
antirez 已提交
6948
        );
A
antirez 已提交
6949
        unlockThreadedIO();
A
antirez 已提交
6950
    }
6951 6952 6953 6954 6955 6956
    for (j = 0; j < server.dbnum; j++) {
        long long keys, vkeys;

        keys = dictSize(server.db[j].dict);
        vkeys = dictSize(server.db[j].expires);
        if (keys || vkeys) {
6957
            info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
6958 6959 6960
                j, keys, vkeys);
        }
    }
6961 6962 6963 6964 6965
    return info;
}

/* INFO command: send the string built by genRedisInfoString() as a bulk
 * reply, that is a "$<length>" header line, the payload, and a final CRLF. */
static void infoCommand(redisClient *c) {
    sds payload = genRedisInfoString();
    sds header = sdscatprintf(sdsempty(),"$%lu\r\n",
        (unsigned long)sdslen(payload));

    addReplySds(c,header);
    addReplySds(c,payload);
    addReply(c,shared.crlf);
}

A
antirez 已提交
6972 6973 6974 6975 6976 6977
/* MONITOR command: flag the client as a monitor and append it to the
 * server.monitors list, then reply with +OK. */
static void monitorCommand(redisClient *c) {
    /* Ignore MONITOR if the client is already a slave or in monitor mode:
     * REDIS_SLAVE is set in both cases. No reply is sent. */
    if ((c->flags & REDIS_SLAVE) != 0) return;

    c->slaveseldb = 0;
    c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
    listAddNodeTail(server.monitors,c);
    addReply(c,shared.ok);
}

/* ================================= Expire ================================= */
/* Delete 'key' from the expires dictionary of 'db'.
 * Return 1 if an entry was deleted, 0 otherwise. */
static int removeExpire(redisDb *db, robj *key) {
    return (dictDelete(db->expires,key) == DICT_OK) ? 1 : 0;
}

/* Add 'key' to the expires dictionary of 'db', using the unix time 'when'
 * (stored directly in the value pointer) as value.
 *
 * Return 1 if the entry was added, in which case the key gets an additional
 * reference. Return 0, leaving the key untouched, if dictAdd() failed. */
static int setExpire(redisDb *db, robj *key, time_t when) {
    if (dictAdd(db->expires,key,(void*)when) != DICT_ERR) {
        incrRefCount(key);
        return 1;
    }
    return 0;
}

A
antirez 已提交
7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011
/* Return the expire time stored for 'key' in the expires dictionary of
 * 'db', or -1 if the dictionary has no entry for this key (i.e. the key
 * is non volatile). */
static time_t getExpire(redisDb *db, robj *key) {
    dictEntry *entry;

    /* Fast path: nothing can be found in an empty dictionary. */
    if (dictSize(db->expires) == 0) return -1;

    entry = dictFind(db->expires,key);
    if (entry == NULL) return -1;
    return (time_t) dictGetEntryVal(entry);
}

A
antirez 已提交
7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025
/* Delete 'key' from 'db' if it has an expire set and the expire time is
 * already in the past.
 *
 * Return 0 if the key has no expire or the expire time is not reached yet.
 * Otherwise the expire is removed, server.stat_expiredkeys is incremented
 * and the return value is 1 if the key was deleted from the main dictionary,
 * 0 if it was not found there. */
static int expireIfNeeded(redisDb *db, robj *key) {
    dictEntry *entry;
    time_t deadline;

    /* No expire? return ASAP */
    if (dictSize(db->expires) == 0) return 0;
    if ((entry = dictFind(db->expires,key)) == NULL) return 0;

    /* The key is still valid during the whole second of its deadline. */
    deadline = (time_t) dictGetEntryVal(entry);
    if (deadline >= time(NULL)) return 0;

    /* Expired: drop the expire first, then the key itself. */
    dictDelete(db->expires,key);
    server.stat_expiredkeys++;
    return dictDelete(db->dict,key) == DICT_OK;
}

/* Delete 'key' from 'db' if it has an expire set, regardless of the expire
 * time being reached or not.
 *
 * Return 0 if the key has no expire. Otherwise server.dirty and
 * server.stat_expiredkeys are incremented, the expire is removed, and the
 * return value is 1 if the key was deleted from the main dictionary, 0 if
 * it was not found there. */
static int deleteIfVolatile(redisDb *db, robj *key) {
    /* No expire? return ASAP */
    if (dictSize(db->expires) == 0) return 0;
    if (dictFind(db->expires,key) == NULL) return 0;

    /* Delete the key */
    server.dirty++;
    server.stat_expiredkeys++;
    dictDelete(db->expires,key);
    return dictDelete(db->dict,key) == DICT_OK;
}

A
Alex McHale 已提交
7044
/* Generic implementation of EXPIRE and EXPIREAT.
 *
 * 'key' is the key to set the timeout on, 'param' is the object holding the
 * numeric argument of the command. 'offset' is subtracted from the parsed
 * number in order to obtain a relative amount of seconds: expireCommand()
 * passes 0, expireatCommand() passes the current unix time.
 *
 * The reply is:
 *  :0 if the key does not exist, or if setExpire() was not able to add
 *     the timeout.
 *  :1 if the timeout was set, or if the resulting amount of seconds is
 *     negative, in which case the key is deleted instead.
 *
 * If 'param' can't be parsed the function returns without adding a reply
 * on its own (getLongFromObject() receives the client, presumably to send
 * the error itself). */
static void expireGenericCommand(redisClient *c, robj *key, robj *param, long offset) {
    /* This must be a long and not a time_t: its address is passed to
     * getLongFromObject(), which presumably stores a long, and the two
     * types are not guaranteed to have the same size. */
    long seconds;

    if (getLongFromObject(c, param, &seconds) != REDIS_OK) return;

    seconds -= offset;

    if (dictFind(c->db->dict,key) == NULL) {
        addReply(c,shared.czero);
        return;
    }
    if (seconds < 0) {
        /* A timeout in the past is the same as deleting the key. */
        if (deleteKey(c->db,key)) server.dirty++;
        addReply(c, shared.cone);
        return;
    }
    if (setExpire(c->db,key,time(NULL)+seconds)) {
        addReply(c,shared.cone);
        server.dirty++;
    } else {
        addReply(c,shared.czero);
    }
}

7073
/* EXPIRE key seconds: the argument is already a relative amount of seconds,
 * so the generic implementation is called with a zero offset. */
static void expireCommand(redisClient *c) {
    robj *key = c->argv[1];
    robj *seconds = c->argv[2];

    expireGenericCommand(c,key,seconds,0);
}

/* EXPIREAT key unixtime: the argument is an absolute unix time, so the
 * current time is passed as the offset to subtract from it. */
static void expireatCommand(redisClient *c) {
    long now = time(NULL);

    expireGenericCommand(c,c->argv[1],c->argv[2],now);
}

A
antirez 已提交
7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092
/* TTL key: reply with the number of seconds left before the key expires.
 * The reply is -1 if getExpire() reports no expire for the key, or if the
 * expire time is already in the past. */
static void ttlCommand(redisClient *c) {
    time_t deadline = getExpire(c->db,c->argv[1]);
    int remaining = -1;

    if (deadline != -1) {
        int delta = (int) (deadline-time(NULL));

        if (delta >= 0) remaining = delta;
    }
    addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",remaining));
}

A
antirez 已提交
7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134
/* ================================ MULTI/EXEC ============================== */

/* Initialize the MULTI/EXEC state of the client as an empty queue:
 * zero queued commands and no commands array allocated. */
static void initClientMultiState(redisClient *c) {
    c->mstate.count = 0;
    c->mstate.commands = NULL;
}

/* Release all the resources associated with the MULTI/EXEC state: for every
 * queued command drop the reference to each argument and free the argument
 * vector, then free the commands array itself. The fields of c->mstate are
 * not reset here: callers in this file call initClientMultiState() next. */
static void freeClientMultiState(redisClient *c) {
    multiCmd *mc = c->mstate.commands;
    int left = c->mstate.count;

    while (left-- > 0) {
        int i = 0;

        while (i < mc->argc) decrRefCount(mc->argv[i++]);
        zfree(mc->argv);
        mc++;
    }
    zfree(c->mstate.commands);
}

/* Append the command the client is currently issuing to its MULTI queue.
 * 'cmd' is the command table entry to run at EXEC time. The argument vector
 * of the client is copied into the new queue slot, and every argument gets
 * an additional reference since it is now referenced by the queue too. */
static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
    multiCmd *slot;
    int argc = c->argc;
    int i;

    /* Make room for one more entry at the end of the array. */
    c->mstate.commands = zrealloc(c->mstate.commands,
            sizeof(multiCmd)*(c->mstate.count+1));
    slot = &c->mstate.commands[c->mstate.count];
    slot->cmd = cmd;
    slot->argc = argc;
    slot->argv = zmalloc(sizeof(robj*)*argc);
    for (i = 0; i < argc; i++) {
        slot->argv[i] = c->argv[i];
        incrRefCount(slot->argv[i]);
    }
    c->mstate.count++;
}

/* MULTI: set the REDIS_MULTI flag on the client and reply with +OK. */
static void multiCommand(redisClient *c) {
    c->flags = c->flags | REDIS_MULTI;
    addReply(c,shared.ok);
}

7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149
/* DISCARD: throw away the commands queued after MULTI and clear the
 * REDIS_MULTI flag. It is an error to call it when the flag is not set. */
static void discardCommand(redisClient *c) {
    if ((c->flags & REDIS_MULTI) == 0) {
        addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
        return;
    }

    /* Release the queue and leave the state ready for a new MULTI. */
    freeClientMultiState(c);
    initClientMultiState(c);
    c->flags &= ~REDIS_MULTI;
    addReply(c,shared.ok);
}

A
antirez 已提交
7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174
/* EXEC: run all the commands queued after MULTI, in order. The reply is a
 * multi bulk header with the number of queued commands followed by whatever
 * each command adds to the reply. It is an error to call EXEC when the
 * REDIS_MULTI flag is not set. */
static void execCommand(redisClient *c) {
    robj **saved_argv;
    int saved_argc;
    int i;

    if ((c->flags & REDIS_MULTI) == 0) {
        addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
        return;
    }

    /* Every queued command is executed with its own argument vector set in
     * the client structure: remember the EXEC one to restore it later. */
    saved_argv = c->argv;
    saved_argc = c->argc;
    addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
    for (i = 0; i < c->mstate.count; i++) {
        multiCmd *mc = &c->mstate.commands[i];

        c->argc = mc->argc;
        c->argv = mc->argv;
        call(c,mc->cmd);
    }
    c->argv = saved_argv;
    c->argc = saved_argc;

    /* Release the queue and leave the transaction state. */
    freeClientMultiState(c);
    initClientMultiState(c);
    c->flags &= ~REDIS_MULTI;
}

A
antirez 已提交
7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194
/* =========================== Blocking Operations  ========================= */

/* Currently Redis blocking operations support is limited to list POP ops,
 * so the current implementation is not fully generic, but it is also not
 * completely specific so it will not require a rewrite to support new
 * kind of blocking operations in the future.
 *
 * Still it's important to note that list blocking operations can be already
 * used as a notification mechanism in order to implement other blocking
 * operations at application level, so there must be a very strong evidence
 * of usefulness and generality before new blocking operations are implemented.
 *
 * This is how the current blocking POP works, we use BLPOP as example:
 * - If the user calls BLPOP and the key exists and contains a non empty list
 *   then LPOP is called instead. So BLPOP is semantically the same as LPOP
 *   if there is no need to block.
 * - If instead BLPOP is called and the key does not exist or the list is
 *   empty we need to block. In order to do so we remove the notification for
 *   new data to read in the client socket (so that we'll not serve new
 *   requests if the blocking request is not served). Also we put the client
 *   in a dictionary (db->blockingkeys) mapping keys to a list of clients
 *   blocking for these keys.
 * - If a PUSH operation against a key with blocked clients waiting is
 *   performed, we serve the first in the list: basically instead of pushing
 *   the new element inside the list we return it to the (first / oldest)
 *   blocking client, unblock the client, and remove it from the list.
 *
 * The above comment and the source code should be enough in order to understand
 * the implementation and modify / fix it later.
 */

/* Set a client in blocking mode for the specified key, with the specified
 * timeout */
7208
static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
A
antirez 已提交
7209 7210
    dictEntry *de;
    list *l;
7211
    int j;
A
antirez 已提交
7212

7213 7214
    c->blockingkeys = zmalloc(sizeof(robj*)*numkeys);
    c->blockingkeysnum = numkeys;
A
antirez 已提交
7215
    c->blockingto = timeout;
7216 7217 7218 7219
    for (j = 0; j < numkeys; j++) {
        /* Add the key in the client structure, to map clients -> keys */
        c->blockingkeys[j] = keys[j];
        incrRefCount(keys[j]);
A
antirez 已提交
7220

7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234
        /* And in the other "side", to map keys -> clients */
        de = dictFind(c->db->blockingkeys,keys[j]);
        if (de == NULL) {
            int retval;

            /* For every key we take a list of clients blocked for it */
            l = listCreate();
            retval = dictAdd(c->db->blockingkeys,keys[j],l);
            incrRefCount(keys[j]);
            assert(retval == DICT_OK);
        } else {
            l = dictGetEntryVal(de);
        }
        listAddNodeTail(l,c);
A
antirez 已提交
7235
    }
7236
    /* Mark the client as a blocked client */
A
antirez 已提交
7237
    c->flags |= REDIS_BLOCKED;
A
antirez 已提交
7238
    server.blpop_blocked_clients++;
A
antirez 已提交
7239 7240 7241
}

/* Unblock a client that's waiting in a blocking operation such as BLPOP */
7242
static void unblockClientWaitingData(redisClient *c) {
A
antirez 已提交
7243 7244
    dictEntry *de;
    list *l;
7245
    int j;
A
antirez 已提交
7246

7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262
    assert(c->blockingkeys != NULL);
    /* The client may wait for multiple keys, so unblock it for every key. */
    for (j = 0; j < c->blockingkeysnum; j++) {
        /* Remove this client from the list of clients waiting for this key. */
        de = dictFind(c->db->blockingkeys,c->blockingkeys[j]);
        assert(de != NULL);
        l = dictGetEntryVal(de);
        listDelNode(l,listSearchKey(l,c));
        /* If the list is empty we need to remove it to avoid wasting memory */
        if (listLength(l) == 0)
            dictDelete(c->db->blockingkeys,c->blockingkeys[j]);
        decrRefCount(c->blockingkeys[j]);
    }
    /* Cleanup the client structure */
    zfree(c->blockingkeys);
    c->blockingkeys = NULL;
A
antirez 已提交
7263
    c->flags &= (~REDIS_BLOCKED);
A
antirez 已提交
7264
    server.blpop_blocked_clients--;
7265
    /* We want to process data if there is some command waiting
7266 7267 7268 7269
     * in the input buffer. Note that this is safe even if
     * unblockClientWaitingData() gets called from freeClient() because
     * freeClient() will be smart enough to call this function
     * *after* c->querybuf was set to NULL. */
A
antirez 已提交
7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295
    if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
}

/* This should be called from any function PUSHing into lists.
 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
 * 'ele' is the element pushed.
 *
 * Return 0 if no client is waiting for a list push against this key.
 *
 * Return 1 if there was a client waiting for a list push against this key:
 * in this case the element was sent to the first client of the waiting list
 * (as a two elements multi bulk reply: key name and element) and the client
 * was unblocked, so the caller should not add the element to the list and
 * should return asap. */
static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
    struct dictEntry *de = dictFind(c->db->blockingkeys,key);
    list *waiters;
    listNode *head;
    redisClient *receiver;

    if (de == NULL) return 0;

    /* Lists in db->blockingkeys are removed as soon as they are empty, so
     * there must be at least one client here. */
    waiters = dictGetEntryVal(de);
    head = listFirst(waiters);
    assert(head != NULL);
    receiver = head->value;

    addReplySds(receiver,sdsnew("*2\r\n"));
    addReplyBulk(receiver,key);
    addReplyBulk(receiver,ele);
    unblockClientWaitingData(receiver);
    return 1;
}

/* Blocking RPOP/LPOP.
 *
 * The arguments of the command are one or more keys followed by a timeout.
 * 'where' is REDIS_HEAD or REDIS_TAIL and is passed untouched to
 * popGenericCommand().
 *
 * Keys are checked in order: the first one holding a non empty list is
 * served immediately, with a two elements multi bulk reply (key name and
 * popped element). A key holding a value that is not a list produces a
 * wrong type error. If no key can be served the client is blocked by
 * blockForKeys(); a timeout greater than zero is converted into an absolute
 * unix time first, any other value is passed as it is. */
static void blockingPopGenericCommand(redisClient *c, int where) {
    robj *o;
    time_t timeout;
    int j;

    for (j = 1; j < c->argc-1; j++) {
        list *l;

        o = lookupKeyWrite(c->db,c->argv[j]);
        if (o == NULL) continue;
        if (o->type != REDIS_LIST) {
            addReply(c,shared.wrongtypeerr);
            return;
        }
        l = o->ptr;
        if (listLength(l) != 0) {
            /* If the list contains elements fall back to the usual
             * non-blocking POP operation */
            robj *argv[2], **orig_argv;
            int orig_argc;

            /* We need to alter the command arguments before calling
             * popGenericCommand() as the command takes a single key.
             * The command name slot is filled as well, so that the
             * temporary vector never contains an uninitialized pointer. */
            orig_argv = c->argv;
            orig_argc = c->argc;
            argv[0] = orig_argv[0];
            argv[1] = orig_argv[j];
            c->argv = argv;
            c->argc = 2;

            /* Also the return value is different, we need to output
             * the multi bulk reply header and the key name. The
             * "real" command will add the last element (the value)
             * for us. If this sounds like a hack to you it's just
             * because it is... */
            addReplySds(c,sdsnew("*2\r\n"));
            addReplyBulk(c,argv[1]);
            popGenericCommand(c,where);

            /* Fix the client structure with the original stuff */
            c->argv = orig_argv;
            c->argc = orig_argc;
            return;
        }
    }
    /* If the list is empty or the key does not exist we must block */
    timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
    if (timeout > 0) timeout += time(NULL);
    blockForKeys(c,c->argv+1,c->argc-2,timeout);
}

/* BLPOP: blocking pop on the REDIS_HEAD side of the list. */
static void blpopCommand(redisClient *c) {
    blockingPopGenericCommand(c,REDIS_HEAD);
}

/* BRPOP: blocking pop on the REDIS_TAIL side of the list. */
static void brpopCommand(redisClient *c) {
    blockingPopGenericCommand(c,REDIS_TAIL);
}

A
antirez 已提交
7362 7363
/* =============================== Replication  ============================= */

A
antirez 已提交
7364
/* Write 'size' bytes from 'ptr' to 'fd', waiting for the descriptor to be
 * writable in slices of 1000 milliseconds.
 *
 * Return 'size' once everything was written. Return -1 if write() fails,
 * or, with errno set to ETIMEDOUT, when more than 'timeout'+1 seconds
 * elapsed since the call started and there is still data to write. */
static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t total = size;
    time_t start = time(NULL);

    timeout++;
    while (size != 0) {
        if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
            ssize_t n = write(fd,ptr,size);

            if (n == -1) return -1;
            ptr += n;
            size -= n;
        }
        /* The timeout is checked after every attempt, successful or not. */
        if ((time(NULL)-start) > timeout) {
            errno = ETIMEDOUT;
            return -1;
        }
    }
    return total;
}

A
antirez 已提交
7384
/* Read exactly 'size' bytes from 'fd' into 'ptr', waiting for the descriptor
 * to be readable in slices of 1000 milliseconds.
 *
 * Return the number of bytes read (that is, 'size') on success.
 * Return -1 on error:
 *  - read() failed: errno is the one set by read().
 *  - End of file was reached before 'size' bytes were read: errno is set
 *    to EIO. Without this check the descriptor would be reported as
 *    readable forever while read() keeps returning 0, making the function
 *    spin until the timeout expires.
 *  - More than 'timeout'+1 seconds elapsed since the call started and there
 *    is still data to read: errno is set to ETIMEDOUT. */
static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread, totread = 0;
    time_t start = time(NULL);

    timeout++;
    while(size) {
        if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
            nread = read(fd,ptr,size);
            if (nread == -1) return -1;
            if (nread == 0) {
                /* Premature EOF: the missing bytes will never arrive. */
                errno = EIO;
                return -1;
            }
            ptr += nread;
            size -= nread;
            totread += nread;
        }
        if ((time(NULL)-start) > timeout) {
            errno = ETIMEDOUT;
            return -1;
        }
    }
    return totread;
}

/* Read a line from 'fd', one byte at a time using syncRead() (so 'timeout'
 * applies to every single byte), and store it into 'ptr' as a NUL terminated
 * string. 'size' is the size of the destination buffer, terminator included.
 *
 * The line ends at the first '\n', that is not stored. A '\r' just before
 * it is removed from the buffer as well, but is still counted in the return
 * value.
 *
 * Return the number of bytes stored, or -1 if syncRead() failed. At most
 * 'size'-1 bytes are stored: if the buffer gets full before a newline is
 * found the function returns what was read so far, and the rest of the line
 * is left unread. Return 0 without touching the buffer if 'size' is not
 * positive. */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    if (size <= 0) return 0;
    *ptr = '\0';
    size--; /* Reserve one byte for the terminator. */
    while(size) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        } else {
            *ptr++ = c;
            *ptr = '\0';
            nread++;
        }
        /* One byte less available: without this the loop would write past
         * the end of the buffer on lines longer than the buffer itself. */
        size--;
    }
    return nread;
}

/* SYNC command: register the client as a slave waiting for a dump of the
 * dataset.
 *
 * The slave is attached to the BGSAVE in progress if there is already
 * another slave waiting for the end of the same BGSAVE, it is set to wait
 * for the next BGSAVE if the one in progress is not suitable, and a new
 * BGSAVE is started if there is none in progress. The transfer itself is
 * not performed here. */
static void syncCommand(redisClient *c) {
    /* ignore SYNC if already slave or in monitor mode */
    if (c->flags & REDIS_SLAVE) return;

    /* SYNC can't be issued when the server has pending data to send to
     * the client about already issued commands. We need a fresh reply
     * buffer registering the differences between the BGSAVE and the current
     * dataset, so that we can copy to other slaves if needed. */
    if (listLength(c->reply) != 0) {
        addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
        return;
    }

    redisLog(REDIS_NOTICE,"Slave ask for synchronization");
    /* Here we need to check if there is a background saving operation
     * in progress, or if it is required to start one */
    if (server.bgsavechildpid != -1) {
        /* Ok a background save is in progress. Let's check if it is a good
         * one for replication, i.e. if there is another slave that is
         * registering differences since the server forked to save */
        redisClient *slave;
        listNode *ln;
        listIter li;

        listRewind(server.slaves,&li);
        while((ln = listNext(&li))) {
            slave = ln->value;
            if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
        }
        if (ln) {
            /* Perfect, the server is already registering differences for
             * another slave. Set the right state, and copy the buffer. */
            listRelease(c->reply);
            c->reply = listDup(slave->reply);
            c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
            redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
        } else {
            /* No way, we need to wait for the next BGSAVE in order to
             * register differences */
            c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
            redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
        }
    } else {
        /* Ok we don't have a BGSAVE in progress, let's start one */
        redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
        if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
            redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
            addReplySds(c,sdsnew("-ERR Unable to perform background save\r\n"));
            return;
        }
        c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
    }
    /* No dump file is open for this slave yet. */
    c->repldbfd = -1;
    c->flags |= REDIS_SLAVE;
    c->slaveseldb = 0;
    listAddNodeTail(server.slaves,c);
    return;
}

7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517
/* Writable event handler used to transfer the dump file to a slave.
 * 'privdata' is the slave client, 'fd' its socket; 'el' and 'mask' are
 * not used.
 *
 * Every call sends at most REDIS_IOBUF_LEN bytes of the file, starting from
 * the offset slave->repldboff, that is advanced by the number of bytes
 * actually written. The first call (offset zero) sends the "$<size>" bulk
 * header first. Once the whole file was sent, the file is closed, the slave
 * is set as REDIS_REPL_ONLINE and the handler is replaced by
 * sendReplyToClient(). On any error the slave is released with
 * freeClient(). */
static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
    redisClient *slave = privdata;
    char buf[REDIS_IOBUF_LEN];
    ssize_t nwritten, buflen;
    REDIS_NOTUSED(el);
    REDIS_NOTUSED(mask);

    if (slave->repldboff == 0) {
        /* Write the bulk write count before to transfer the DB. In theory here
         * we don't know how much room there is in the output buffer of the
         * socket, but in practice SO_SNDLOWAT (the minimum count for output
         * operations) will never be smaller than the few bytes we need. */
        sds bulkcount;

        bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(long long)
            slave->repldbsize);
        if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
        {
            sdsfree(bulkcount);
            freeClient(slave);
            return;
        }
        sdsfree(bulkcount);
    }
    /* Reading from the wrong position would send garbage to the slave, so
     * a failed seek aborts the transfer. */
    if (lseek(slave->repldbfd,slave->repldboff,SEEK_SET) == (off_t)-1) {
        redisLog(REDIS_WARNING,"Seek error sending DB to slave: %s",
            strerror(errno));
        freeClient(slave);
        return;
    }
    buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
    if (buflen <= 0) {
        redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
            (buflen == 0) ? "premature EOF" : strerror(errno));
        freeClient(slave);
        return;
    }
    if ((nwritten = write(fd,buf,buflen)) == -1) {
        redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
            strerror(errno));
        freeClient(slave);
        return;
    }
    /* A short write is fine: the next call restarts from the new offset. */
    slave->repldboff += nwritten;
    if (slave->repldboff == slave->repldbsize) {
        close(slave->repldbfd);
        slave->repldbfd = -1;
        aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
        slave->replstate = REDIS_REPL_ONLINE;
        if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
            sendReplyToClient, slave) == AE_ERR) {
            freeClient(slave);
            return;
        }
        addReplySds(slave,sdsempty());
        redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
    }
}
A
antirez 已提交
7538

7539 7540 7541 7542 7543 7544 7545
/* This function is called at the end of every backgrond saving.
 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
 * otherwise REDIS_ERR is passed to the function.
 *
 * The goal of this function is to handle slaves waiting for a successful
 * background saving in order to perform non-blocking synchronization. */
static void updateSlavesWaitingBgsave(int bgsaveerr) {
7546 7547
    listNode *ln;
    int startbgsave = 0;
A
antirez 已提交
7548
    listIter li;
A
antirez 已提交
7549

A
antirez 已提交
7550 7551
    listRewind(server.slaves,&li);
    while((ln = listNext(&li))) {
7552
        redisClient *slave = ln->value;
A
antirez 已提交
7553

7554 7555 7556 7557
        if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
            startbgsave = 1;
            slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
        } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
7558
            struct redis_stat buf;
A
Alex McHale 已提交
7559

7560 7561 7562 7563 7564 7565
            if (bgsaveerr != REDIS_OK) {
                freeClient(slave);
                redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
                continue;
            }
            if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
7566
                redis_fstat(slave->repldbfd,&buf) == -1) {
7567 7568 7569 7570 7571 7572 7573 7574
                freeClient(slave);
                redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
                continue;
            }
            slave->repldboff = 0;
            slave->repldbsize = buf.st_size;
            slave->replstate = REDIS_REPL_SEND_BULK;
            aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7575
            if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
7576 7577 7578 7579
                freeClient(slave);
                continue;
            }
        }
A
antirez 已提交
7580
    }
7581 7582
    if (startbgsave) {
        if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
A
antirez 已提交
7583 7584 7585
            listIter li;

            listRewind(server.slaves,&li);
7586
            redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
A
antirez 已提交
7587
            while((ln = listNext(&li))) {
7588
                redisClient *slave = ln->value;
A
antirez 已提交
7589

7590 7591 7592 7593 7594
                if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
                    freeClient(slave);
            }
        }
    }
A
antirez 已提交
7595 7596 7597
}

static int syncWithMaster(void) {
7598
    char buf[1024], tmpfile[256], authcmd[1024];
7599
    long dumpsize;
A
antirez 已提交
7600
    int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
7601
    int dfd, maxtries = 5;
A
antirez 已提交
7602 7603 7604 7605 7606 7607

    if (fd == -1) {
        redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
            strerror(errno));
        return REDIS_ERR;
    }
7608 7609 7610 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631

    /* AUTH with the master if required. */
    if(server.masterauth) {
    	snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
    	if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
            close(fd);
            redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
                strerror(errno));
            return REDIS_ERR;
    	}
        /* Read the AUTH result.  */
        if (syncReadLine(fd,buf,1024,3600) == -1) {
            close(fd);
            redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
                strerror(errno));
            return REDIS_ERR;
        }
        if (buf[0] != '+') {
            close(fd);
            redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
            return REDIS_ERR;
        }
    }

A
antirez 已提交
7632 7633 7634 7635 7636 7637 7638 7639
    /* Issue the SYNC command */
    if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
        close(fd);
        redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
            strerror(errno));
        return REDIS_ERR;
    }
    /* Read the bulk write count */
7640
    if (syncReadLine(fd,buf,1024,3600) == -1) {
A
antirez 已提交
7641 7642 7643 7644 7645
        close(fd);
        redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
            strerror(errno));
        return REDIS_ERR;
    }
7646 7647 7648 7649 7650
    if (buf[0] != '$') {
        close(fd);
        redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
        return REDIS_ERR;
    }
7651 7652
    dumpsize = strtol(buf+1,NULL,10);
    redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
A
antirez 已提交
7653
    /* Read the bulk write data on a temp file */
7654 7655 7656 7657 7658
    while(maxtries--) {
        snprintf(tmpfile,256,
            "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
        dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
        if (dfd != -1) break;
7659
        sleep(1);
7660
    }
A
antirez 已提交
7661 7662 7663 7664 7665 7666 7667 7668 7669 7670 7671 7672 7673 7674 7675 7676 7677 7678 7679 7680 7681 7682 7683 7684 7685 7686 7687 7688 7689 7690 7691 7692 7693
    if (dfd == -1) {
        close(fd);
        redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
        return REDIS_ERR;
    }
    while(dumpsize) {
        int nread, nwritten;

        nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
        if (nread == -1) {
            redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
                strerror(errno));
            close(fd);
            close(dfd);
            return REDIS_ERR;
        }
        nwritten = write(dfd,buf,nread);
        if (nwritten == -1) {
            redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
            close(fd);
            close(dfd);
            return REDIS_ERR;
        }
        dumpsize -= nread;
    }
    close(dfd);
    if (rename(tmpfile,server.dbfilename) == -1) {
        redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
        unlink(tmpfile);
        close(fd);
        return REDIS_ERR;
    }
    emptyDb();
7694
    if (rdbLoad(server.dbfilename) != REDIS_OK) {
A
antirez 已提交
7695 7696 7697 7698 7699 7700
        redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
        close(fd);
        return REDIS_ERR;
    }
    server.master = createClient(fd);
    server.master->flags |= REDIS_MASTER;
7701
    server.master->authenticated = 1;
A
antirez 已提交
7702 7703 7704 7705
    server.replstate = REDIS_REPL_CONNECTED;
    return REDIS_OK;
}

7706 7707 7708 7709 7710 7711 7712 7713 7714 7715 7716 7717 7718 7719 7720 7721 7722 7723 7724 7725 7726 7727
/* SLAVEOF command implementation.
 *
 * When the two arguments are "NO" "ONE" (case insensitive) and a master
 * host is currently configured, the host is cleared, the master client (if
 * any) is freed and the replication state is set to REDIS_REPL_NONE.
 * With any other pair of arguments they are taken as <host> <port>: the
 * new master address is stored, the current master client (if any) is
 * freed and the state is set to REDIS_REPL_CONNECT.
 * The reply is always shared.ok. */
static void slaveofCommand(redisClient *c) {
    int becomemaster = !strcasecmp(c->argv[1]->ptr,"no") &&
                       !strcasecmp(c->argv[2]->ptr,"one");

    if (!becomemaster) {
        /* Switch to (or change) the master we replicate from. */
        sdsfree(server.masterhost);
        server.masterhost = sdsdup(c->argv[1]->ptr);
        server.masterport = atoi(c->argv[2]->ptr);
        if (server.master) freeClient(server.master);
        server.replstate = REDIS_REPL_CONNECT;
        redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
            server.masterhost, server.masterport);
    } else if (server.masterhost) {
        /* SLAVEOF NO ONE while acting as a slave: stop replicating. */
        sdsfree(server.masterhost);
        server.masterhost = NULL;
        if (server.master) freeClient(server.master);
        server.replstate = REDIS_REPL_NONE;
        redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
    }
    addReply(c,shared.ok);
}

A
antirez 已提交
7728 7729
/* ============================ Maxmemory directive  ======================== */

7730 7731 7732 7733 7734
/* Try to free one object form the pre-allocated objects free list.
 * This is useful under low mem conditions as by default we take 1 million
 * free objects allocated. On success REDIS_OK is returned, otherwise
 * REDIS_ERR. */
static int tryFreeOneObjectFromFreelist(void) {
7735 7736
    robj *o;

7737 7738 7739 7740 7741 7742 7743 7744 7745 7746 7747 7748
    if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
    if (listLength(server.objfreelist)) {
        listNode *head = listFirst(server.objfreelist);
        o = listNodeValue(head);
        listDelNode(server.objfreelist,head);
        if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
        zfree(o);
        return REDIS_OK;
    } else {
        if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
        return REDIS_ERR;
    }
7749 7750
}

A
antirez 已提交
7751 7752 7753 7754 7755 7756 7757 7758 7759 7760 7761 7762 7763
/* This function gets called when 'maxmemory' is set on the config file to limit
 * the max memory used by the server, and we are out of memory.
 * This function will try to, in order:
 *
 * - Free objects from the free list
 * - Try to remove keys with an EXPIRE set
 *
 * The loop runs until zmalloc_used_memory() is no longer above
 * server.maxmemory, or until a full pass over all the databases finds no
 * key with an expire set, in which case the function gives up and returns.
 * If it is not possible to free enough memory to reach
 * used-memory < maxmemory the server will start refusing commands that
 * will enlarge even more the memory usage.
 */
static void freeMemoryIfNeeded(void) {
    while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
        int j, k, freed = 0;

        if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
        for (j = 0; j < server.dbnum; j++) {
            /* The expire time is stored as a time_t: keep the minimum in
             * the same type (an int would truncate it) and use minkey as
             * the "nothing sampled yet" marker. */
            time_t minttl = 0;
            robj *minkey = NULL;
            struct dictEntry *de;

            if (dictSize(server.db[j].expires) == 0) continue;

            /* From a sample of three keys drop the one nearest to
             * the natural expire */
            for (k = 0; k < 3; k++) {
                time_t t;

                de = dictGetRandomKey(server.db[j].expires);
                t = (time_t) dictGetEntryVal(de);
                if (minkey == NULL || t < minttl) {
                    minkey = dictGetEntryKey(de);
                    minttl = t;
                }
            }
            deleteKey(server.db+j,minkey);
            freed = 1;
        }
        if (!freed) return; /* nothing to free... */
    }
}

7793 7794 7795 7796 7797 7798 7799 7800 7801 7802 7803 7804 7805 7806 7807
/* ============================== Append Only file ========================== */

/* Append the command described by cmd/argv/argc, executed against the DB
 * with index 'dictid', to the append only file.
 *
 * - If 'dictid' differs from server.appendseldb a SELECT command is emitted
 *   first and server.appendseldb is updated.
 * - EXPIRE is rewritten as EXPIREAT, with the absolute time computed as
 *   time(NULL) plus the numeric value of argv[2].
 * - The whole buffer is written with a single write(2) call. On error or
 *   short write a warning is logged and the process exits with status 1.
 * - While a background rewrite is in progress (server.bgrewritechildpid
 *   != -1) the same buffer is also appended to server.bgrewritebuf.
 * - fsync(2) is called according to server.appendfsync. */
static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
    sds out = sdsempty();
    robj *expireat[3];
    int isexpire = (cmd->proc == expireCommand);
    ssize_t written;
    time_t now;
    int i;

    /* The target DB changed since the last appended command: a SELECT
     * must come first. */
    if (dictid != server.appendseldb) {
        char dbname[64];

        snprintf(dbname,sizeof(dbname),"%d",dictid);
        out = sdscatprintf(out,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
            (unsigned long)strlen(dbname),dbname);
        server.appendseldb = dictid;
    }

    /* Build a replacement argument vector turning EXPIRE into EXPIREAT. */
    if (isexpire) {
        long when;

        expireat[0] = createStringObject("EXPIREAT",8);
        expireat[1] = argv[1];
        incrRefCount(argv[1]);
        when = time(NULL)+strtol(argv[2]->ptr,NULL,10);
        expireat[2] = createObject(REDIS_STRING,
            sdscatprintf(sdsempty(),"%ld",when));
        argv = expireat;
    }

    /* Serialize the command as a multi bulk: *<argc> then one bulk per
     * argument. */
    out = sdscatprintf(out,"*%d\r\n",argc);
    for (i = 0; i < argc; i++) {
        robj *arg = getDecodedObject(argv[i]);

        out = sdscatprintf(out,"$%lu\r\n",(unsigned long)sdslen(arg->ptr));
        out = sdscatlen(out,arg->ptr,sdslen(arg->ptr));
        out = sdscatlen(out,"\r\n",2);
        decrRefCount(arg);
    }

    /* Drop the references held by the temporary EXPIREAT vector. */
    if (isexpire) {
        for (i = 0; i < 3; i++)
            decrRefCount(expireat[i]);
    }

    /* A single write is performed on purpose: this should be atomic at
     * least when the file lives on a real physical filesystem. It protects
     * against the server being killed, not against power failures. */
    written = write(server.appendfd,out,sdslen(out));
    if (written != (signed)sdslen(out)) {
        /* Better to exit than to give the illusion that everything is
         * working as expected. */
        if (written == -1) {
            redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
        } else {
            redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
        }
        exit(1);
    }

    /* With a background rewrite in progress, accumulate the difference
     * between the child DB and the current one, so that it can be appended
     * to the new append only file once the child is done. */
    if (server.bgrewritechildpid != -1)
        server.bgrewritebuf = sdscatlen(server.bgrewritebuf,out,sdslen(out));

    sdsfree(out);

    now = time(NULL);
    if (server.appendfsync != APPENDFSYNC_ALWAYS &&
        !(server.appendfsync == APPENDFSYNC_EVERYSEC &&
          now-server.lastfsync > 1))
        return;

    fsync(server.appendfd); /* Let's try to get this data on the disk */
    server.lastfsync = now;
}

/* In Redis commands are always executed in the context of a client, so in
 * order to load the append only file we need to create a fake client. */
static struct redisClient *createFakeClient(void) {
    struct redisClient *c = zmalloc(sizeof(*c));

    selectDb(c,0);
    c->fd = -1;
    c->querybuf = sdsempty();
    c->argc = 0;
    c->argv = NULL;
    c->flags = 0;
A
antirez 已提交
7891 7892 7893
    /* We set the fake client as a slave waiting for the synchronization
     * so that Redis will not try to send replies to this client. */
    c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7894 7895 7896 7897 7898 7899 7900 7901 7902 7903 7904 7905 7906 7907 7908 7909 7910 7911 7912
    c->reply = listCreate();
    listSetFreeMethod(c->reply,decrRefCount);
    listSetDupMethod(c->reply,dupClientReplyValue);
    return c;
}

/* Release a client created by createFakeClient(): its reply list, its
 * query buffer and the client structure itself. */
static void freeFakeClient(struct redisClient *c) {
    listRelease(c->reply);
    sdsfree(c->querybuf);
    zfree(c);
}

/* Replay the append log file. On error REDIS_OK is returned. On non fatal
 * error (the append only file is zero-length) REDIS_ERR is returned. On
 * fatal error an error message is logged and the program exists. */
int loadAppendOnlyFile(char *filename) {
    struct redisClient *fakeClient;
    FILE *fp = fopen(filename,"r");
    struct redis_stat sb;
7913
    unsigned long long loadedkeys = 0;
7914 7915 7916 7917 7918 7919 7920 7921 7922 7923 7924 7925 7926 7927 7928 7929 7930 7931 7932 7933 7934 7935 7936 7937 7938 7939 7940 7941 7942 7943 7944 7945

    if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
        return REDIS_ERR;

    if (fp == NULL) {
        redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
        exit(1);
    }

    fakeClient = createFakeClient();
    while(1) {
        int argc, j;
        unsigned long len;
        robj **argv;
        char buf[128];
        sds argsds;
        struct redisCommand *cmd;

        if (fgets(buf,sizeof(buf),fp) == NULL) {
            if (feof(fp))
                break;
            else
                goto readerr;
        }
        if (buf[0] != '*') goto fmterr;
        argc = atoi(buf+1);
        argv = zmalloc(sizeof(robj*)*argc);
        for (j = 0; j < argc; j++) {
            if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
            if (buf[0] != '$') goto fmterr;
            len = strtol(buf+1,NULL,10);
            argsds = sdsnewlen(NULL,len);
7946
            if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
7947 7948 7949 7950 7951 7952 7953 7954 7955 7956
            argv[j] = createObject(REDIS_STRING,argsds);
            if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
        }

        /* Command lookup */
        cmd = lookupCommand(argv[0]->ptr);
        if (!cmd) {
            redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
            exit(1);
        }
7957
        /* Try object encoding */
7958
        if (cmd->flags & REDIS_CMD_BULK)
7959
            argv[argc-1] = tryObjectEncoding(argv[argc-1]);
7960 7961 7962 7963 7964 7965 7966 7967 7968 7969
        /* Run the command in the context of a fake client */
        fakeClient->argc = argc;
        fakeClient->argv = argv;
        cmd->proc(fakeClient);
        /* Discard the reply objects list from the fake client */
        while(listLength(fakeClient->reply))
            listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
        /* Clean up, ready for the next command */
        for (j = 0; j < argc; j++) decrRefCount(argv[j]);
        zfree(argv);
7970 7971 7972 7973
        /* Handle swapping while loading big datasets when VM is on */
        loadedkeys++;
        if (server.vm_enabled && (loadedkeys % 5000) == 0) {
            while (zmalloc_used_memory() > server.vm_max_memory) {
7974
                if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
7975 7976
            }
        }
7977 7978 7979 7980 7981 7982 7983 7984 7985 7986 7987 7988 7989 7990 7991 7992 7993
    }
    fclose(fp);
    freeFakeClient(fakeClient);
    return REDIS_OK;

readerr:
    if (feof(fp)) {
        redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
    } else {
        redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
    }
    exit(1);
fmterr:
    redisLog(REDIS_WARNING,"Bad file format reading the append only file");
    exit(1);
}

7994
/* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
A
antirez 已提交
7995
static int fwriteBulkObject(FILE *fp, robj *obj) {
7996
    char buf[128];
7997 7998
    int decrrc = 0;

A
antirez 已提交
7999 8000 8001 8002 8003 8004
    /* Avoid the incr/decr ref count business if possible to help
     * copy-on-write (we are often in a child process when this function
     * is called).
     * Also makes sure that key objects don't get incrRefCount-ed when VM
     * is enabled */
    if (obj->encoding != REDIS_ENCODING_RAW) {
8005 8006 8007
        obj = getDecodedObject(obj);
        decrrc = 1;
    }
8008 8009
    snprintf(buf,sizeof(buf),"$%ld\r\n",(long)sdslen(obj->ptr));
    if (fwrite(buf,strlen(buf),1,fp) == 0) goto err;
8010 8011
    if (sdslen(obj->ptr) && fwrite(obj->ptr,sdslen(obj->ptr),1,fp) == 0)
        goto err;
8012
    if (fwrite("\r\n",2,1,fp) == 0) goto err;
8013
    if (decrrc) decrRefCount(obj);
8014 8015
    return 1;
err:
8016
    if (decrrc) decrRefCount(obj);
8017 8018 8019
    return 0;
}

A
antirez 已提交
8020 8021 8022 8023 8024 8025 8026 8027 8028 8029 8030 8031
/* Write the binary-safe string 's' of 'len' bytes to 'fp' in the bulk
 * format $<count>\r\n<payload>\r\n
 *
 * The payload write is skipped when 'len' is zero, so 's' is not accessed
 * in that case. Returns 1 on success, 0 on write error. */
static int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
    char buf[128];

    /* 'len' is unsigned: it must be formatted with %lu, not %ld. */
    snprintf(buf,sizeof(buf),"$%lu\r\n",len);
    if (fwrite(buf,strlen(buf),1,fp) == 0) return 0;
    if (len && fwrite(s,len,1,fp) == 0) return 0;
    if (fwrite("\r\n",2,1,fp) == 0) return 0;
    return 1;
}

8032 8033 8034 8035 8036 8037 8038 8039 8040 8041 8042 8043 8044 8045 8046 8047 8048 8049 8050 8051 8052 8053 8054 8055 8056 8057 8058 8059 8060 8061 8062 8063 8064 8065 8066 8067 8068 8069 8070 8071 8072 8073 8074 8075 8076 8077 8078 8079 8080 8081 8082 8083 8084
/* Write the double 'd', formatted with "%.17g", to 'fp' in the bulk format
 * $<count>\r\n<payload>\r\n
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char payload[128], header[128];
    size_t paylen;

    /* The payload already carries its trailing CRLF, which is not part of
     * the advertised count: hence the -2. */
    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    paylen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)paylen-2);

    return fwrite(header,strlen(header),1,fp) != 0 &&
           fwrite(payload,paylen,1,fp) != 0;
}

/* Write the long 'l', in decimal, to 'fp' in the bulk format
 * $<count>\r\n<payload>\r\n
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkLong(FILE *fp, long l) {
    char payload[128], header[128];
    size_t paylen;

    /* The payload already carries its trailing CRLF, which is not part of
     * the advertised count: hence the -2. */
    snprintf(payload,sizeof(payload),"%ld\r\n",l);
    paylen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)paylen-2);

    return fwrite(header,strlen(header),1,fp) != 0 &&
           fwrite(payload,paylen,1,fp) != 0;
}

/* Write a sequence of commands able to fully rebuild the dataset into
 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
static int rewriteAppendOnlyFile(char *filename) {
    dictIterator *di = NULL;
    dictEntry *de;
    FILE *fp;
    char tmpfile[256];
    int j;
    time_t now = time(NULL);

    /* Note that we have to use a different temp name here compared to the
     * one used by rewriteAppendOnlyFileBackground() function. */
    snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
    fp = fopen(tmpfile,"w");
    if (!fp) {
        redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
        return REDIS_ERR;
    }
    for (j = 0; j < server.dbnum; j++) {
        char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
        redisDb *db = server.db+j;
        dict *d = db->dict;
        if (dictSize(d) == 0) continue;
        di = dictGetIterator(d);
        if (!di) {
            fclose(fp);
            return REDIS_ERR;
        }

        /* SELECT the new DB */
        if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
A
antirez 已提交
8085
        if (fwriteBulkLong(fp,j) == 0) goto werr;
8086 8087 8088

        /* Iterate this DB writing every entry */
        while((de = dictNext(di)) != NULL) {
8089 8090 8091 8092 8093
            robj *key, *o;
            time_t expiretime;
            int swapped;

            key = dictGetEntryKey(de);
8094 8095 8096 8097
            /* If the value for this key is swapped, load a preview in memory.
             * We use a "swapped" flag to remember if we need to free the
             * value object instead to just increment the ref count anyway
             * in order to avoid copy-on-write of pages if we are forked() */
8098 8099
            if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
                key->storage == REDIS_VM_SWAPPING) {
8100 8101 8102 8103 8104 8105 8106
                o = dictGetEntryVal(de);
                swapped = 0;
            } else {
                o = vmPreviewObject(key);
                swapped = 1;
            }
            expiretime = getExpire(db,key);
8107 8108 8109 8110 8111 8112 8113

            /* Save the key and associated value */
            if (o->type == REDIS_STRING) {
                /* Emit a SET command */
                char cmd[]="*3\r\n$3\r\nSET\r\n";
                if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
                /* Key and value */
A
antirez 已提交
8114 8115
                if (fwriteBulkObject(fp,key) == 0) goto werr;
                if (fwriteBulkObject(fp,o) == 0) goto werr;
8116 8117 8118 8119
            } else if (o->type == REDIS_LIST) {
                /* Emit the RPUSHes needed to rebuild the list */
                list *list = o->ptr;
                listNode *ln;
A
antirez 已提交
8120
                listIter li;
8121

A
antirez 已提交
8122 8123
                listRewind(list,&li);
                while((ln = listNext(&li))) {
8124 8125 8126 8127
                    char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
                    robj *eleobj = listNodeValue(ln);

                    if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
A
antirez 已提交
8128 8129
                    if (fwriteBulkObject(fp,key) == 0) goto werr;
                    if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8130 8131 8132 8133 8134 8135 8136 8137 8138 8139 8140 8141
                }
            } else if (o->type == REDIS_SET) {
                /* Emit the SADDs needed to rebuild the set */
                dict *set = o->ptr;
                dictIterator *di = dictGetIterator(set);
                dictEntry *de;

                while((de = dictNext(di)) != NULL) {
                    char cmd[]="*3\r\n$4\r\nSADD\r\n";
                    robj *eleobj = dictGetEntryKey(de);

                    if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
A
antirez 已提交
8142 8143
                    if (fwriteBulkObject(fp,key) == 0) goto werr;
                    if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8144 8145 8146 8147 8148 8149 8150 8151 8152 8153 8154 8155 8156 8157
                }
                dictReleaseIterator(di);
            } else if (o->type == REDIS_ZSET) {
                /* Emit the ZADDs needed to rebuild the sorted set */
                zset *zs = o->ptr;
                dictIterator *di = dictGetIterator(zs->dict);
                dictEntry *de;

                while((de = dictNext(di)) != NULL) {
                    char cmd[]="*4\r\n$4\r\nZADD\r\n";
                    robj *eleobj = dictGetEntryKey(de);
                    double *score = dictGetEntryVal(de);

                    if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
A
antirez 已提交
8158
                    if (fwriteBulkObject(fp,key) == 0) goto werr;
8159
                    if (fwriteBulkDouble(fp,*score) == 0) goto werr;
A
antirez 已提交
8160
                    if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8161 8162
                }
                dictReleaseIterator(di);
A
antirez 已提交
8163 8164 8165 8166 8167 8168 8169 8170 8171 8172 8173 8174 8175 8176 8177 8178 8179 8180 8181 8182 8183 8184 8185 8186 8187 8188 8189 8190 8191 8192 8193 8194
            } else if (o->type == REDIS_HASH) {
                char cmd[]="*4\r\n$4\r\nHSET\r\n";

                /* Emit the HSETs needed to rebuild the hash */
                if (o->encoding == REDIS_ENCODING_ZIPMAP) {
                    unsigned char *p = zipmapRewind(o->ptr);
                    unsigned char *field, *val;
                    unsigned int flen, vlen;

                    while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
                        if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
                        if (fwriteBulkObject(fp,key) == 0) goto werr;
                        if (fwriteBulkString(fp,(char*)field,flen) == -1)
                            return -1;
                        if (fwriteBulkString(fp,(char*)val,vlen) == -1)
                            return -1;
                    }
                } else {
                    dictIterator *di = dictGetIterator(o->ptr);
                    dictEntry *de;

                    while((de = dictNext(di)) != NULL) {
                        robj *field = dictGetEntryKey(de);
                        robj *val = dictGetEntryVal(de);

                        if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
                        if (fwriteBulkObject(fp,key) == 0) goto werr;
                        if (fwriteBulkObject(fp,field) == -1) return -1;
                        if (fwriteBulkObject(fp,val) == -1) return -1;
                    }
                    dictReleaseIterator(di);
                }
8195
            } else {
A
antirez 已提交
8196
                redisAssert(0);
8197 8198 8199
            }
            /* Save the expire time */
            if (expiretime != -1) {
8200
                char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
8201 8202 8203
                /* If this key is already expired skip it */
                if (expiretime < now) continue;
                if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
A
antirez 已提交
8204
                if (fwriteBulkObject(fp,key) == 0) goto werr;
8205 8206
                if (fwriteBulkLong(fp,expiretime) == 0) goto werr;
            }
8207
            if (swapped) decrRefCount(o);
8208 8209 8210 8211 8212 8213 8214 8215
        }
        dictReleaseIterator(di);
    }

    /* Make sure data will not remain on the OS's output buffers */
    fflush(fp);
    fsync(fileno(fp));
    fclose(fp);
A
Alex McHale 已提交
8216

8217 8218 8219 8220 8221 8222 8223 8224 8225 8226 8227 8228 8229
    /* Use RENAME to make sure the DB file is changed atomically only
     * if the generate DB file is ok. */
    if (rename(tmpfile,filename) == -1) {
        redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
        unlink(tmpfile);
        return REDIS_ERR;
    }
    redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
    return REDIS_OK;

werr:
    fclose(fp);
    unlink(tmpfile);
8230
    redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
8231 8232 8233 8234 8235 8236 8237 8238 8239 8240 8241 8242 8243 8244 8245 8246 8247 8248 8249 8250
    if (di) dictReleaseIterator(di);
    return REDIS_ERR;
}

/* This is how rewriting of the append only file in background works:
 *
 * 1) The user calls BGREWRITEAOF
 * 2) Redis calls this function, that forks():
 *    2a) the child rewrite the append only file in a temp file.
 *    2b) the parent accumulates differences in server.bgrewritebuf.
 * 3) When the child finished '2a' exists.
 * 4) The parent will trap the exit code, if it's OK, will append the
 *    data accumulated into server.bgrewritebuf into the temp file, and
 *    finally will rename(2) the temp file in the actual file name.
 *    The the new file is reopened as the new append only file. Profit!
 */
static int rewriteAppendOnlyFileBackground(void) {
    pid_t childpid;

    if (server.bgrewritechildpid != -1) return REDIS_ERR;
8251
    if (server.vm_enabled) waitEmptyIOJobsQueue();
8252 8253 8254 8255
    if ((childpid = fork()) == 0) {
        /* Child */
        char tmpfile[256];

8256 8257
        if (server.vm_enabled) vmReopenSwapFile();
        close(server.fd);
8258 8259
        snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
        if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
8260
            _exit(0);
8261
        } else {
8262
            _exit(1);
8263 8264 8265 8266 8267 8268 8269 8270 8271 8272 8273 8274
        }
    } else {
        /* Parent */
        if (childpid == -1) {
            redisLog(REDIS_WARNING,
                "Can't rewrite append only file in background: fork: %s",
                strerror(errno));
            return REDIS_ERR;
        }
        redisLog(REDIS_NOTICE,
            "Background append only file rewriting started by pid %d",childpid);
        server.bgrewritechildpid = childpid;
8275
        updateDictResizePolicy();
A
antirez 已提交
8276 8277 8278 8279 8280
        /* We set appendseldb to -1 in order to force the next call to the
         * feedAppendOnlyFile() to issue a SELECT command, so the differences
         * accumulated by the parent into server.bgrewritebuf will start
         * with a SELECT statement and it will be safe to merge. */
        server.appendseldb = -1;
8281 8282 8283 8284 8285 8286 8287 8288 8289 8290 8291
        return REDIS_OK;
    }
    return REDIS_OK; /* unreached */
}

/* BGREWRITEAOF command implementation: start a background rewrite of the
 * append only file. Replies with an error if a rewrite is already in
 * progress or if it can't be started, with a status line otherwise. */
static void bgrewriteaofCommand(redisClient *c) {
    if (server.bgrewritechildpid != -1) {
        addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
        return;
    }
    if (rewriteAppendOnlyFileBackground() != REDIS_OK) {
        addReply(c,shared.err);
        return;
    }
    addReplySds(c,sdsnew("+Background append only file rewriting started\r\n"));
}

/* Remove the temp file "temp-rewriteaof-bg-<childpid>.aof", that is the
 * name used by the background rewrite child with that pid. The result of
 * unlink(2) is ignored. */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof", (int) childpid);
    unlink(path);
}

8306 8307 8308 8309 8310 8311 8312 8313 8314 8315 8316 8317 8318 8319 8320 8321 8322 8323 8324 8325 8326 8327
/* Virtual Memory is composed mainly of two subsystems:
 * - Blocking Virtual Memory
 * - Threaded Virtual Memory I/O
 * The two parts are not fully decoupled, but functions are split among two
 * different sections of the source code (delimited by comments) in order to
 * make more clear what functionality is about the blocking VM and what about
 * the threaded (not blocking) VM.
 *
 * Redis VM design:
 *
 * Redis VM is a blocking VM (one that blocks reading swapped values from
 * disk into memory when a value swapped out is needed in memory) that is made
 * unblocking by trying to examine the command argument vector in order to
 * load in background values that will likely be needed in order to exec
 * the command. The command is executed only once all the relevant keys
 * are loaded into memory.
 *
 * This basically is almost as simple as a blocking VM, but almost as parallel
 * as a fully non-blocking VM.
 */

/* =================== Virtual Memory - Blocking Side  ====================== */
8328 8329 8330 8331 8332 8333

/* Replace the first occurrence of '%p' in server.vm_swap_file with the pid
 * of the current process. If the marker is not present the name is left
 * untouched. Otherwise the old name is released with zfree() and replaced
 * by a newly built sds string. */
static void expandVmSwapFilename(void) {
    char *marker = strstr(server.vm_swap_file,"%p");
    sds expanded;

    if (marker == NULL) return;

    /* Cut the old name at the marker, so that it can be used as the
     * prefix; what follows the two marker bytes is the suffix. */
    *marker = '\0';
    expanded = sdscat(sdsempty(),server.vm_swap_file);
    expanded = sdscatprintf(expanded,"%ld",(long) getpid());
    expanded = sdscat(expanded,marker+2);

    zfree(server.vm_swap_file);
    server.vm_swap_file = expanded;
}

8345 8346
static void vmInit(void) {
    off_t totsize;
8347
    int pipefds[2];
8348
    size_t stacksize;
8349

8350 8351 8352
    if (server.vm_max_threads != 0)
        zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */

8353 8354
    expandVmSwapFilename();
    redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
A
antirez 已提交
8355 8356 8357
    if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
        server.vm_fp = fopen(server.vm_swap_file,"w+b");
    }
8358
    if (server.vm_fp == NULL) {
A
antirez 已提交
8359 8360 8361
        redisLog(REDIS_WARNING,
            "Impossible to open the swap file: %s. Exiting.",
            strerror(errno));
8362 8363 8364 8365 8366
        exit(1);
    }
    server.vm_fd = fileno(server.vm_fp);
    server.vm_next_page = 0;
    server.vm_near_pages = 0;
A
antirez 已提交
8367 8368 8369 8370
    server.vm_stats_used_pages = 0;
    server.vm_stats_swapped_objects = 0;
    server.vm_stats_swapouts = 0;
    server.vm_stats_swapins = 0;
8371 8372 8373 8374 8375 8376 8377 8378 8379
    totsize = server.vm_pages*server.vm_page_size;
    redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
    if (ftruncate(server.vm_fd,totsize) == -1) {
        redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
            strerror(errno));
        exit(1);
    } else {
        redisLog(REDIS_NOTICE,"Swap file allocated with success");
    }
A
antirez 已提交
8380
    server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
8381
    redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
A
antirez 已提交
8382
        (long long) (server.vm_pages+7)/8, server.vm_pages);
A
antirez 已提交
8383
    memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
8384

8385 8386 8387 8388
    /* Initialize threaded I/O (used by Virtual Memory) */
    server.io_newjobs = listCreate();
    server.io_processing = listCreate();
    server.io_processed = listCreate();
A
antirez 已提交
8389
    server.io_ready_clients = listCreate();
8390
    pthread_mutex_init(&server.io_mutex,NULL);
8391 8392
    pthread_mutex_init(&server.obj_freelist_mutex,NULL);
    pthread_mutex_init(&server.io_swapfile_mutex,NULL);
8393
    server.io_active_threads = 0;
8394 8395 8396 8397 8398 8399 8400 8401
    if (pipe(pipefds) == -1) {
        redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
            ,strerror(errno));
        exit(1);
    }
    server.io_ready_pipe_read = pipefds[0];
    server.io_ready_pipe_write = pipefds[1];
    redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
8402 8403 8404 8405 8406
    /* LZF requires a lot of stack */
    pthread_attr_init(&server.io_threads_attr);
    pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
    while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
    pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
8407 8408 8409 8410
    /* Listen for events in the threaded I/O pipe */
    if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
        vmThreadedIOCompletedJob, NULL) == AE_ERR)
        oom("creating file event");
8411 8412
}

A
antirez 已提交
8413 8414 8415 8416
/* Mark the page as used */
static void vmMarkPageUsed(off_t page) {
    off_t byte = page/8;
    int bit = page&7;
8417
    redisAssert(vmFreePage(page) == 1);
A
antirez 已提交
8418 8419 8420 8421 8422 8423 8424 8425
    server.vm_bitmap[byte] |= 1<<bit;
}

/* Mark N contiguous pages as used, with 'page' being the first. */
static void vmMarkPagesUsed(off_t page, off_t count) {
    off_t j;

    for (j = 0; j < count; j++)
A
antirez 已提交
8426
        vmMarkPageUsed(page+j);
A
antirez 已提交
8427
    server.vm_stats_used_pages += count;
8428 8429
    redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
        (long long)count, (long long)page);
A
antirez 已提交
8430 8431 8432 8433 8434 8435
}

/* Mark the page as free */
static void vmMarkPageFree(off_t page) {
    off_t byte = page/8;
    int bit = page&7;
8436
    redisAssert(vmFreePage(page) == 0);
A
antirez 已提交
8437 8438 8439 8440 8441 8442 8443 8444
    server.vm_bitmap[byte] &= ~(1<<bit);
}

/* Mark N contiguous pages as free, with 'page' being the first. */
static void vmMarkPagesFree(off_t page, off_t count) {
    off_t j;

    for (j = 0; j < count; j++)
A
antirez 已提交
8445
        vmMarkPageFree(page+j);
A
antirez 已提交
8446
    server.vm_stats_used_pages -= count;
8447 8448
    redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
        (long long)count, (long long)page);
A
antirez 已提交
8449 8450 8451 8452 8453 8454
}

/* Test if the page is free */
static int vmFreePage(off_t page) {
    off_t byte = page/8;
    int bit = page&7;
A
antirez 已提交
8455
    return (server.vm_bitmap[byte] & (1<<bit)) == 0;
A
antirez 已提交
8456 8457 8458
}

/* Find N contiguous free pages storing the first page of the cluster in *first.
A
Alex McHale 已提交
8459
 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
A
antirez 已提交
8460
 * REDIS_ERR is returned.
A
antirez 已提交
8461 8462 8463 8464 8465 8466 8467 8468 8469
 *
 * This function uses a simple algorithm: we try to allocate
 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
 * again from the start of the swap file searching for free spaces.
 *
 * If it looks pretty clear that there are no free pages near our offset
 * we try to find less populated places doing a forward jump of
 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
 * without hurry, and then we jump again and so forth...
A
Alex McHale 已提交
8470
 *
A
antirez 已提交
8471 8472 8473 8474 8475 8476
 * This function can be improved using a free list to avoid to guess
 * too much, since we could collect data about freed pages.
 *
 * note: I implemented this function just after watching an episode of
 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
 */
A
antirez 已提交
8477
static int vmFindContiguousPages(off_t *first, off_t n) {
A
antirez 已提交
8478 8479 8480 8481 8482 8483 8484 8485 8486 8487 8488 8489 8490 8491 8492 8493 8494 8495 8496 8497 8498 8499 8500 8501 8502 8503
    off_t base, offset = 0, since_jump = 0, numfree = 0;

    if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
        server.vm_near_pages = 0;
        server.vm_next_page = 0;
    }
    server.vm_near_pages++; /* Yet another try for pages near to the old ones */
    base = server.vm_next_page;

    while(offset < server.vm_pages) {
        off_t this = base+offset;

        /* If we overflow, restart from page zero */
        if (this >= server.vm_pages) {
            this -= server.vm_pages;
            if (this == 0) {
                /* Just overflowed, what we found on tail is no longer
                 * interesting, as it's no longer contiguous. */
                numfree = 0;
            }
        }
        if (vmFreePage(this)) {
            /* This is a free page */
            numfree++;
            /* Already got N free pages? Return to the caller, with success */
            if (numfree == n) {
A
antirez 已提交
8504 8505
                *first = this-(n-1);
                server.vm_next_page = this+1;
8506
                redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
A
antirez 已提交
8507
                return REDIS_OK;
A
antirez 已提交
8508 8509 8510 8511 8512 8513 8514 8515 8516 8517 8518 8519 8520 8521 8522 8523 8524 8525 8526 8527
            }
        } else {
            /* The current one is not a free page */
            numfree = 0;
        }

        /* Fast-forward if the current page is not free and we already
         * searched enough near this place. */
        since_jump++;
        if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
            offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
            since_jump = 0;
            /* Note that even if we rewind after the jump, we are don't need
             * to make sure numfree is set to zero as we only jump *if* it
             * is set to zero. */
        } else {
            /* Otherwise just check the next page */
            offset++;
        }
    }
A
antirez 已提交
8528 8529 8530
    return REDIS_ERR;
}

8531 8532 8533 8534 8535 8536
/* Serialize the object 'o' into the swap file starting at page 'page'.
 *
 * When server.vm_enabled is true the swap file mutex is held while seeking
 * and writing. Returns REDIS_OK on success, or REDIS_ERR (after logging a
 * warning) if the seek fails, the final flush fails, or the stream has its
 * error indicator set after the write. */
static int vmWriteObjectOnSwap(robj *o, off_t page) {
    if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
    if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
        if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
        redisLog(REDIS_WARNING,
            "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
            strerror(errno));
        return REDIS_ERR;
    }
    rdbSaveObject(server.vm_fp,o);
    /* A failed write must not be reported as success, or the caller would
     * release the only valid copy of the value. The stream error indicator
     * is checked too, since a buffered write may already have failed before
     * the final flush. */
    if (fflush(server.vm_fp) == EOF || ferror(server.vm_fp)) {
        int saved_errno = errno; /* may be stale if only ferror() fired */

        /* The error indicator is sticky: reset it so that the next write
         * is judged on its own. */
        clearerr(server.vm_fp);
        if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
        redisLog(REDIS_WARNING,
            "Critical VM problem in vmWriteObjectOnSwap(): can't write: %s",
            strerror(saved_errno));
        return REDIS_ERR;
    }
    if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
    return REDIS_OK;
}

A
antirez 已提交
8547 8548 8549 8550
/* Swap the 'val' object relative to 'key' into disk. Store all the information
 * needed to later retrieve the object into the key object.
 * If we can't find enough contiguous empty pages to swap the object on disk
 * REDIS_ERR is returned. */
8551
static int vmSwapObjectBlocking(robj *key, robj *val) {
8552
    off_t pages = rdbSavedObjectPages(val,NULL);
A
antirez 已提交
8553 8554 8555
    off_t page;

    assert(key->storage == REDIS_VM_MEMORY);
A
antirez 已提交
8556
    assert(key->refcount == 1);
A
antirez 已提交
8557
    if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return REDIS_ERR;
8558
    if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return REDIS_ERR;
A
antirez 已提交
8559 8560 8561
    key->vm.page = page;
    key->vm.usedpages = pages;
    key->storage = REDIS_VM_SWAPPED;
8562
    key->vtype = val->type;
A
antirez 已提交
8563 8564
    decrRefCount(val); /* Deallocate the object from memory. */
    vmMarkPagesUsed(page,pages);
A
antirez 已提交
8565 8566 8567
    redisLog(REDIS_DEBUG,"VM: object %s swapped out at %lld (%lld pages)",
        (unsigned char*) key->ptr,
        (unsigned long long) page, (unsigned long long) pages);
A
antirez 已提交
8568 8569
    server.vm_stats_swapped_objects++;
    server.vm_stats_swapouts++;
A
antirez 已提交
8570 8571 8572
    return REDIS_OK;
}

8573 8574
static robj *vmReadObjectFromSwap(off_t page, int type) {
    robj *o;
A
antirez 已提交
8575

8576 8577
    if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
    if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
A
antirez 已提交
8578
        redisLog(REDIS_WARNING,
A
antirez 已提交
8579
            "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
A
antirez 已提交
8580
            strerror(errno));
8581
        _exit(1);
A
antirez 已提交
8582
    }
8583 8584
    o = rdbLoadObject(type,server.vm_fp);
    if (o == NULL) {
A
antirez 已提交
8585
        redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
8586
        _exit(1);
A
antirez 已提交
8587
    }
8588 8589 8590 8591 8592 8593 8594 8595 8596 8597 8598 8599
    if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
    return o;
}

/* Read from the swap file the value associated with 'key' and return the
 * newly loaded object. The key is asserted to be in the REDIS_VM_SWAPPED or
 * REDIS_VM_LOADING state.
 *
 * With 'preview' false the key is flagged as being in memory, its access
 * time is refreshed, and the pages used on disk are marked as free.
 * With 'preview' true the key object and the pages are left untouched.
 * The swap-ins counter is incremented in both cases. */
static robj *vmGenericLoadObject(robj *key, int preview) {
    robj *val;

    redisAssert(key->storage == REDIS_VM_SWAPPED || key->storage == REDIS_VM_LOADING);
    val = vmReadObjectFromSwap(key->vm.page,key->vtype);

    if (preview) {
        redisLog(REDIS_DEBUG, "VM: object %s previewed from disk",
            (unsigned char*) key->ptr);
    } else {
        key->storage = REDIS_VM_MEMORY;
        key->vm.atime = server.unixtime;
        vmMarkPagesFree(key->vm.page,key->vm.usedpages);
        redisLog(REDIS_DEBUG, "VM: object %s loaded from disk",
            (unsigned char*) key->ptr);
        server.vm_stats_swapped_objects--;
    }
    server.vm_stats_swapins++;
    return val;
}

A
antirez 已提交
8617 8618
/* Bring the value of 'key' from the swap file back into memory and return
 * it. If a background load of the same key is in progress it is canceled
 * first, as the value is needed synchronously. */
static robj *vmLoadObject(robj *key) {
    int loading_in_background = (key->storage == REDIS_VM_LOADING);

    if (loading_in_background) vmCancelThreadedIOJob(key);
    return vmGenericLoadObject(key,0);
}

/* Return a copy of the swapped value of 'key' read from disk, without
 * modifying the key and without releasing the pages on the swap file.
 * Useful to operate on a value without really swapping it in, for instance
 * while saving the dataset or rewriting the append only log. */
static robj *vmPreviewObject(robj *key) {
    robj *copy = vmGenericLoadObject(key,1);

    return copy;
}

A
antirez 已提交
8634 8635 8636 8637 8638 8639 8640 8641 8642 8643 8644 8645 8646 8647 8648 8649 8650 8651 8652 8653 8654 8655 8656 8657 8658 8659 8660 8661 8662 8663 8664 8665 8666 8667 8668 8669 8670 8671 8672 8673 8674 8675 8676 8677 8678 8679 8680 8681 8682 8683 8684 8685 8686 8687 8688 8689 8690 8691 8692 8693 8694 8695 8696
/* Rough estimate of the memory used by the string object 'ele': the object
 * header, plus the length of the sds string for raw encoded objects. */
static long vmEstimateElementSize(robj *ele) {
    if (ele->encoding == REDIS_ENCODING_RAW)
        return sizeof(*ele)+sdslen(ele->ptr);
    return sizeof(*ele);
}

/* How good a candidate is this object for swapping?
 * The better the candidate, the greater the returned value.
 *
 * A fast estimation of the object size in memory is combined with aging
 * information:
 *
 *   swappability = idle-time * log(1 + estimated size)
 *
 * Bigger objects are preferred over smaller ones, but not proportionally,
 * hence the logarithm. For aggregate types the size of a single element
 * (the first one for lists, a random one for sets, sorted sets and hash
 * tables) is taken as representative of all the elements.
 *
 * Returns 0 if the object idle time is not positive. */
static double computeObjectSwappability(robj *o) {
    time_t age = server.unixtime - o->vm.atime;
    long asize = 0;

    if (age <= 0) return 0;
    switch(o->type) {
    case REDIS_STRING:
        if (o->encoding != REDIS_ENCODING_RAW) {
            asize = sizeof(*o);
        } else {
            asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
        }
        break;
    case REDIS_LIST: {
        list *l = o->ptr;
        listNode *ln = listFirst(l);

        asize = sizeof(list);
        if (ln) {
            long elesize = vmEstimateElementSize(ln->value);

            asize += (sizeof(listNode)+elesize)*listLength(l);
        }
        break;
    }
    case REDIS_SET:
    case REDIS_ZSET: {
        int z = (o->type == REDIS_ZSET);
        dict *d = z ? ((zset*)o->ptr)->dict : o->ptr;

        asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
        if (z) asize += sizeof(zset)-sizeof(dict);
        if (dictSize(d)) {
            struct dictEntry *de = dictGetRandomKey(d);
            long elesize = vmEstimateElementSize(dictGetEntryKey(de));

            asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
            if (z) asize += sizeof(zskiplistNode)*dictSize(d);
        }
        break;
    }
    case REDIS_HASH:
        if (o->encoding == REDIS_ENCODING_ZIPMAP) {
            unsigned char *p = zipmapRewind((unsigned char*)o->ptr);
            unsigned int len = zipmapLen((unsigned char*)o->ptr);
            unsigned int klen, vlen;
            unsigned char *key, *val;

            /* Use the first field/value pair as the sample; an empty
             * zipmap contributes nothing. */
            if (zipmapNext(p,&key,&klen,&val,&vlen) == NULL) {
                klen = 0;
                vlen = 0;
            }
            asize = len*(klen+vlen+3);
        } else if (o->encoding == REDIS_ENCODING_HT) {
            dict *d = o->ptr;

            asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
            if (dictSize(d)) {
                struct dictEntry *de = dictGetRandomKey(d);
                long elesize;

                /* Every entry holds both a field and a value object, so
                 * the two estimates are summed (the value estimate used
                 * to overwrite the field one). */
                elesize = vmEstimateElementSize(dictGetEntryKey(de));
                elesize += vmEstimateElementSize(dictGetEntryVal(de));
                asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
            }
        }
        break;
    default:
        /* Unknown type: the size estimate stays at zero. */
        break;
    }
    return (double)age*log(1+asize);
}

/* Try to swap an object that's a good candidate for swapping.
 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
8735 8736 8737 8738 8739
 * to swap any object at all.
 *
 * If 'usethreaded' is true, Redis will try to swap the object in background
 * using I/O threads. */
static int vmSwapOneObject(int usethreads) {
A
antirez 已提交
8740 8741 8742
    int j, i;
    struct dictEntry *best = NULL;
    double best_swappability = 0;
8743
    redisDb *best_db = NULL;
A
antirez 已提交
8744 8745 8746 8747
    robj *key, *val;

    for (j = 0; j < server.dbnum; j++) {
        redisDb *db = server.db+j;
8748 8749 8750
        /* Why maxtries is set to 100?
         * Because this way (usually) we'll find 1 object even if just 1% - 2%
         * are swappable objects */
8751
        int maxtries = 100;
A
antirez 已提交
8752 8753 8754 8755 8756 8757

        if (dictSize(db->dict) == 0) continue;
        for (i = 0; i < 5; i++) {
            dictEntry *de;
            double swappability;

8758
            if (maxtries) maxtries--;
A
antirez 已提交
8759 8760 8761
            de = dictGetRandomKey(db->dict);
            key = dictGetEntryKey(de);
            val = dictGetEntryVal(de);
A
antirez 已提交
8762 8763 8764 8765 8766 8767 8768 8769
            /* Only swap objects that are currently in memory.
             *
             * Also don't swap shared objects if threaded VM is on, as we
             * try to ensure that the main thread does not touch the
             * object while the I/O thread is using it, but we can't
             * control other keys without adding additional mutex. */
            if (key->storage != REDIS_VM_MEMORY ||
                (server.vm_max_threads != 0 && val->refcount != 1)) {
8770 8771 8772
                if (maxtries) i--; /* don't count this try */
                continue;
            }
A
antirez 已提交
8773 8774 8775 8776
            swappability = computeObjectSwappability(val);
            if (!best || swappability > best_swappability) {
                best = de;
                best_swappability = swappability;
8777
                best_db = db;
A
antirez 已提交
8778 8779 8780
            }
        }
    }
8781
    if (best == NULL) return REDIS_ERR;
A
antirez 已提交
8782 8783 8784
    key = dictGetEntryKey(best);
    val = dictGetEntryVal(best);

8785
    redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
A
antirez 已提交
8786 8787 8788 8789 8790 8791 8792 8793 8794
        key->ptr, best_swappability);

    /* Unshare the key if needed */
    if (key->refcount > 1) {
        robj *newkey = dupStringObject(key);
        decrRefCount(key);
        key = dictGetEntryKey(best) = newkey;
    }
    /* Swap it */
8795
    if (usethreads) {
8796
        vmSwapObjectThreaded(key,val,best_db);
A
antirez 已提交
8797 8798
        return REDIS_OK;
    } else {
8799 8800 8801 8802 8803 8804
        if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
            dictGetEntryVal(best) = NULL;
            return REDIS_OK;
        } else {
            return REDIS_ERR;
        }
A
antirez 已提交
8805 8806 8807
    }
}

8808 8809 8810 8811 8812 8813 8814 8815
/* Swap one object out synchronously. Returns what vmSwapOneObject(0)
 * returns. */
static int vmSwapOneObjectBlocking(void) {
    return vmSwapOneObject(0);
}

/* Swap one object out using the I/O threads. Returns what
 * vmSwapOneObject(1) returns. */
static int vmSwapOneObjectThreaded(void) {
    return vmSwapOneObject(1);
}

A
antirez 已提交
8816 8817 8818 8819 8820 8821 8822
/* Return 1 if it is safe to swap objects out right now, 0 otherwise.
 * Swapping is not allowed while a background save (BGSAVE) or a background
 * append only file rewrite (BGREWRITEAOF) child is running. */
static int vmCanSwapOut(void) {
    if (server.bgsavechildpid != -1) return 0;
    if (server.bgrewritechildpid != -1) return 0;
    return 1;
}

A
antirez 已提交
8823 8824 8825 8826 8827 8828 8829 8830 8831 8832 8833 8834 8835
/* Delete 'key' from 'db' if it exists and its storage state is anything
 * other than REDIS_VM_MEMORY. Returns 1 if the key was deleted, 0 if it was
 * not found or is in memory. */
static int deleteIfSwapped(redisDb *db, robj *key) {
    dictEntry *entry = dictFind(db->dict,key);
    robj *stored;

    if (entry == NULL) return 0;
    stored = dictGetEntryKey(entry);
    if (stored->storage != REDIS_VM_MEMORY) {
        deleteKey(db,key);
        return 1;
    }
    return 0;
}

8836 8837
/* =================== Virtual Memory - Threaded I/O  ======================= */

8838
/* Release an I/O job. For LOAD, PREPARE_SWAP and DO_SWAP jobs the reference
 * to the value, if any, is dropped.
 *
 * The key field is not decrRefCount()ed, as its reference count was not
 * incremented when the job was created: the key is only used as an
 * identifier, and if a key is removed the job should never be touched
 * again. */
static void freeIOJob(iojob *j) {
    int holds_val = j->type == REDIS_IOJOB_PREPARE_SWAP ||
                    j->type == REDIS_IOJOB_DO_SWAP ||
                    j->type == REDIS_IOJOB_LOAD;

    if (holds_val && j->val != NULL) decrRefCount(j->val);
    zfree(j);
}

8850 8851 8852 8853 8854 8855 8856
/* Event handler for the read side of the threaded I/O pipe.
 *
 * Every time a thread finishes a job it writes a byte into the write side
 * of a unix pipe in order to "awake" the main thread, and this function is
 * called. For every byte read, the oldest job in server.io_processed is
 * removed and post-processed here. At most a percentage
 * (REDIS_MAX_COMPLETED_JOBS_PROCESSED) of the jobs found in the queue by
 * the first iteration is handled per call, with a minimum of one. */
static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
            int mask)
{
    char byte[1];
    int nread, handled = 0, budget = -1, keepswapping = 1;
    REDIS_NOTUSED(el);
    REDIS_NOTUSED(mask);
    REDIS_NOTUSED(privdata);

    /* For every byte read from the pipe there is one completed I/O job
     * to process. */
    for (;;) {
        iojob *job;
        listNode *node;
        robj *key;
        struct dictEntry *de;

        nread = read(fd,byte,1);
        if (nread != 1) break;

        redisLog(REDIS_DEBUG,"Processing I/O completed job");

        /* Get the processed element (the oldest one) */
        lockThreadedIO();
        assert(listLength(server.io_processed) != 0);
        if (budget == -1) {
            budget = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
            if (budget <= 0) budget = 1;
        }
        node = listFirst(server.io_processed);
        job = node->value;
        listDelNode(server.io_processed,node);
        unlockThreadedIO();

        /* Canceled jobs are just discarded. */
        if (job->canceled) {
            freeIOJob(job);
            continue;
        }

        /* Post process the job in the main thread, as there are things that
         * can be done only here to avoid race conditions and/or invasive
         * locks. */
        redisLog(REDIS_DEBUG,"Job %p type: %d, key at %p (%s) refcount: %d\n", (void*) job, job->type, (void*)job->key, (char*)job->key->ptr, job->key->refcount);
        de = dictFind(job->db->dict,job->key);
        assert(de != NULL);
        key = dictGetEntryKey(de);

        switch (job->type) {
        case REDIS_IOJOB_LOAD: {
            redisDb *db;

            /* Key loaded, bring it at home */
            key->storage = REDIS_VM_MEMORY;
            key->vm.atime = server.unixtime;
            vmMarkPagesFree(key->vm.page,key->vm.usedpages);
            redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
                (unsigned char*) key->ptr);
            server.vm_stats_swapped_objects--;
            server.vm_stats_swapins++;
            dictGetEntryVal(de) = job->val;
            incrRefCount(job->val);
            db = job->db; /* saved, the job is released just below */
            freeIOJob(job);
            /* Handle clients waiting for this key to be loaded. */
            handleClientsBlockedOnSwappedKey(db,key);
            break;
        }
        case REDIS_IOJOB_PREPARE_SWAP:
            /* Now the number of pages required to swap this object is
             * known. Find some space for it and queue the job again,
             * rebranded as REDIS_IOJOB_DO_SWAP. */
            if (vmCanSwapOut() &&
                vmFindContiguousPages(&job->page,job->pages) != REDIS_ERR)
            {
                /* The pages must be marked as used now; if the job gets
                 * canceled they will be marked as free again. */
                vmMarkPagesUsed(job->page,job->pages);
                job->type = REDIS_IOJOB_DO_SWAP;
                lockThreadedIO();
                queueIOJob(job);
                unlockThreadedIO();
            } else {
                /* No space, or swapping is not allowed as there is a
                 * fork()ed child trying to save stuff on disk. */
                freeIOJob(job);
                key->storage = REDIS_VM_MEMORY; /* undo operation */
            }
            break;
        case REDIS_IOJOB_DO_SWAP: {
            robj *val;

            /* Key swapped. We can finally free some memory. */
            if (key->storage != REDIS_VM_SWAPPING) {
                /* Dump some state before the assertion below fails. */
                printf("key->storage: %d\n",key->storage);
                printf("key->name: %s\n",(char*)key->ptr);
                printf("key->refcount: %d\n",key->refcount);
                printf("val: %p\n",(void*)job->val);
                printf("val->type: %d\n",job->val->type);
                printf("val->ptr: %s\n",(char*)job->val->ptr);
            }
            redisAssert(key->storage == REDIS_VM_SWAPPING);
            val = dictGetEntryVal(de);
            key->vm.page = job->page;
            key->vm.usedpages = job->pages;
            key->storage = REDIS_VM_SWAPPED;
            key->vtype = job->val->type;
            decrRefCount(val); /* Deallocate the object from memory. */
            dictGetEntryVal(de) = NULL;
            redisLog(REDIS_DEBUG,
                "VM: object %s swapped out at %lld (%lld pages) (threaded)",
                (unsigned char*) key->ptr,
                (unsigned long long) job->page, (unsigned long long) job->pages);
            server.vm_stats_swapped_objects++;
            server.vm_stats_swapouts++;
            freeIOJob(job);
            /* Put a few more swap requests in queue if we are still
             * out of memory */
            if (keepswapping && vmCanSwapOut() &&
                zmalloc_used_memory() > server.vm_max_memory)
            {
                int more;

                do {
                    lockThreadedIO();
                    more = listLength(server.io_newjobs) <
                            (unsigned) server.vm_max_threads;
                    unlockThreadedIO();
                    /* Don't waste CPU time if swappable objects are rare. */
                    if (vmSwapOneObjectThreaded() == REDIS_ERR) {
                        keepswapping = 0;
                        break;
                    }
                } while (more);
            }
            break;
        }
        default:
            break;
        }

        if (++handled == budget) return;
    }
    if (nread < 0 && errno != EAGAIN) {
        redisLog(REDIS_WARNING,
            "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
            strerror(errno));
    }
}

/* Acquire server.io_mutex. */
static void lockThreadedIO(void) {
    pthread_mutex_t *mutex = &server.io_mutex;

    pthread_mutex_lock(mutex);
}

/* Release server.io_mutex. */
static void unlockThreadedIO(void) {
    pthread_mutex_t *mutex = &server.io_mutex;

    pthread_mutex_unlock(mutex);
}

/* Cancel the not yet canceled I/O job whose key is the object 'o'.
 *
 * The job is removed from the queue if it was not started yet, or flagged
 * as canceled if it was already completed. If a thread is working on it
 * right now, the function waits for the job to complete and tries again.
 * Finally the storage state of 'o' is reverted (LOADING -> SWAPPED,
 * SWAPPING -> MEMORY).
 *
 * 'o' is asserted to be in the LOADING or SWAPPING state, and a matching
 * job is expected to exist. */
static void vmCancelThreadedIOJob(robj *o) {
    list *queues[3];
    int q;

    queues[0] = server.io_newjobs;
    queues[1] = server.io_processing;
    queues[2] = server.io_processed;

    assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
again:
    lockThreadedIO();
    /* Search for a matching key in one of the queues */
    for (q = 0; q < 3; q++) {
        listNode *node;
        listIter iter;

        listRewind(queues[q],&iter);
        while ((node = listNext(&iter)) != NULL) {
            iojob *job = node->value;

            /* Skip jobs already canceled and jobs about other keys. */
            if (job->canceled || job->key != o) continue;

            redisLog(REDIS_DEBUG,"*** CANCELED %p (%s) (type %d) (LIST ID %d)\n",
                (void*)job, (char*)o->ptr, job->type, q);

            if (q == 1) {
                /* io_processing: a thread is using the job right now.
                 *
                 * It is probably accessing the object if this is a
                 * PREPARE_SWAP or DO_SWAP job. If it is a LOAD job it may
                 * be reading from disk, and canceling before the job
                 * terminates could lead to corrupted data in these pages.
                 * So wait for the job to move into the next queue
                 * (processed), sleeping a bit between the attempts: this
                 * condition should be very rare. */
                unlockThreadedIO();
                usleep(1);
                goto again;
            }

            /* Mark the pages as free since the swap didn't happen, or
             * happened but is now discarded. */
            if (job->type == REDIS_IOJOB_DO_SWAP)
                vmMarkPagesFree(job->page,job->pages);

            if (q == 0) {
                /* io_newjobs: the job was not processed yet, the best
                 * thing to do is to remove it from the queue. */
                freeIOJob(job);
                listDelNode(queues[q],node);
            } else {
                /* io_processed: the job was already processed, just mark
                 * it as canceled so that it is ignored when the completed
                 * jobs are handled. */
                job->canceled = 1;
            }

            /* Finally adjust the storage type of the object in order to
             * "UNDO" the operation. */
            if (o->storage == REDIS_VM_LOADING)
                o->storage = REDIS_VM_SWAPPED;
            else if (o->storage == REDIS_VM_SWAPPING)
                o->storage = REDIS_VM_MEMORY;
            unlockThreadedIO();
            return;
        }
    }
    unlockThreadedIO();
    assert(1 != 1); /* We should never reach this */
}

9077 9078 9079 9080 9081 9082 9083 9084 9085 9086 9087
static void *IOThreadEntryPoint(void *arg) {
    iojob *j;
    listNode *ln;
    REDIS_NOTUSED(arg);

    pthread_detach(pthread_self());
    while(1) {
        /* Get a new job to process */
        lockThreadedIO();
        if (listLength(server.io_newjobs) == 0) {
            /* No new jobs in queue, exit. */
9088 9089
            redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
                (long) pthread_self());
9090 9091 9092 9093 9094 9095 9096 9097 9098 9099 9100 9101
            server.io_active_threads--;
            unlockThreadedIO();
            return NULL;
        }
        ln = listFirst(server.io_newjobs);
        j = ln->value;
        listDelNode(server.io_newjobs,ln);
        /* Add the job in the processing queue */
        j->thread = pthread_self();
        listAddNodeTail(server.io_processing,j);
        ln = listLast(server.io_processing); /* We use ln later to remove it */
        unlockThreadedIO();
9102 9103
        redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
            (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
9104 9105 9106

        /* Process the Job */
        if (j->type == REDIS_IOJOB_LOAD) {
A
antirez 已提交
9107
            j->val = vmReadObjectFromSwap(j->page,j->key->vtype);
9108 9109 9110 9111 9112
        } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
            FILE *fp = fopen("/dev/null","w+");
            j->pages = rdbSavedObjectPages(j->val,fp);
            fclose(fp);
        } else if (j->type == REDIS_IOJOB_DO_SWAP) {
9113 9114
            if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
                j->canceled = 1;
9115 9116 9117
        }

        /* Done: insert the job into the processed queue */
9118 9119
        redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
            (long) pthread_self(), (void*)j, (char*)j->key->ptr);
9120 9121 9122 9123
        lockThreadedIO();
        listDelNode(server.io_processing,ln);
        listAddNodeTail(server.io_processed,j);
        unlockThreadedIO();
A
Alex McHale 已提交
9124

9125 9126 9127 9128 9129 9130 9131 9132
        /* Signal the main thread there is new stuff to process */
        assert(write(server.io_ready_pipe_write,"x",1) == 1);
    }
    return NULL; /* never reached */
}

static void spawnIOThread(void) {
    pthread_t thread;
9133
    sigset_t mask, omask;
9134
    int err;
9135

9136 9137 9138 9139 9140
    sigemptyset(&mask);
    sigaddset(&mask,SIGCHLD);
    sigaddset(&mask,SIGHUP);
    sigaddset(&mask,SIGPIPE);
    pthread_sigmask(SIG_SETMASK, &mask, &omask);
9141 9142 9143 9144 9145
    while ((err = pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL)) != 0) {
        redisLog(REDIS_WARNING,"Unable to spawn an I/O thread: %s",
            strerror(err));
        usleep(1000000);
    }
9146
    pthread_sigmask(SIG_SETMASK, &omask, NULL);
9147 9148 9149
    server.io_active_threads++;
}

9150 9151
/* We need to wait for the last thread to exit before we are able to
 * fork() in order to BGSAVE or BGREWRITEAOF. */
9152
static void waitEmptyIOJobsQueue(void) {
9153
    while(1) {
9154 9155
        int io_processed_len;

9156
        lockThreadedIO();
9157 9158 9159 9160
        if (listLength(server.io_newjobs) == 0 &&
            listLength(server.io_processing) == 0 &&
            server.io_active_threads == 0)
        {
9161 9162 9163
            unlockThreadedIO();
            return;
        }
9164 9165 9166 9167 9168
        /* While waiting for empty jobs queue condition we post-process some
         * finshed job, as I/O threads may be hanging trying to write against
         * the io_ready_pipe_write FD but there are so much pending jobs that
         * it's blocking. */
        io_processed_len = listLength(server.io_processed);
9169
        unlockThreadedIO();
9170 9171 9172 9173 9174 9175
        if (io_processed_len) {
            vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
            usleep(1000); /* 1 millisecond */
        } else {
            usleep(10000); /* 10 milliseconds */
        }
9176 9177 9178
    }
}

9179
/* Open server.vm_swap_file again in "r+b" mode and store the new stream and
 * its descriptor in server.vm_fp / server.vm_fd. If the file can't be opened
 * a warning is logged and the process terminates with _exit(1).
 *
 * Note: the old stream is not closed, as we are in the child process and
 * don't want to mess at all with the original file object. */
static void vmReopenSwapFile(void) {
    FILE *reopened = fopen(server.vm_swap_file,"r+b");

    server.vm_fp = reopened;
    if (reopened == NULL) {
        redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
            server.vm_swap_file);
        _exit(1);
    }
    server.vm_fd = fileno(reopened);
}

9191 9192
/* Append the job 'j' to server.io_newjobs and, if fewer than
 * server.vm_max_threads I/O threads are active, spawn a new one.
 *
 * This function must be called with threaded I/O locked. */
static void queueIOJob(iojob *j) {
    char *keyname = (char*)j->key->ptr;

    redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
        (void*)j, j->type, keyname);
    listAddNodeTail(server.io_newjobs,j);
    if (server.io_active_threads >= server.vm_max_threads) return;
    spawnIOThread();
}

/* Queue a REDIS_IOJOB_PREPARE_SWAP job for the pair 'key'/'val' of 'db'.
 *
 * The key must be in the REDIS_VM_MEMORY state with a reference count of 1
 * (both are asserted). The job takes a reference to 'val', and the key is
 * marked as REDIS_VM_SWAPPING before the job is queued.
 *
 * Always returns REDIS_OK. */
static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
    iojob *job;

    assert(key->storage == REDIS_VM_MEMORY);
    assert(key->refcount == 1);

    job = zmalloc(sizeof(*job));
    job->canceled = 0;
    job->thread = (pthread_t) -1;
    job->type = REDIS_IOJOB_PREPARE_SWAP;
    job->db = db;
    job->key = key;
    incrRefCount(val);
    job->val = val;
    key->storage = REDIS_VM_SWAPPING;

    lockThreadedIO();
    queueIOJob(job);
    unlockThreadedIO();
    return REDIS_OK;
}

9222 9223
/* ============ Virtual Memory - Blocking clients on missing keys =========== */

A
antirez 已提交
9224 9225 9226 9227 9228 9229 9230 9231 9232 9233 9234 9235 9236 9237 9238 9239 9240 9241 9242 9243 9244 9245
/* This function makes the client 'c' wait for the key 'key' to be loaded.
 * If there is not already a job loading the key, it is created.
 * The key is added to the io_keys list in the client structure, and also
 * to the hash table mapping swapped keys to waiting clients, that is,
 * c->db->io_keys.
 *
 * Returns 0 if the client does not need to wait (the key does not exist,
 * is in memory, or was being swapped out and the swap got canceled),
 * 1 if the client was registered as waiting for the key. */
static int waitForSwappedKey(redisClient *c, robj *key) {
    struct dictEntry *entry;
    robj *stored;
    list *waiters;

    /* Nothing to wait for if the key does not exist or is already in RAM. */
    entry = dictFind(c->db->dict,key);
    if (entry == NULL) return 0;
    stored = dictGetEntryKey(entry);
    if (stored->storage == REDIS_VM_MEMORY) return 0;
    if (stored->storage == REDIS_VM_SWAPPING) {
        /* We were swapping the key, undo it! */
        vmCancelThreadedIOJob(stored);
        return 0;
    }

    /* OK: the key is either swapped, or being loaded just now. */

    /* Client => keys it is waiting for. */
    listAddNodeTail(c->io_keys,key);
    incrRefCount(key);

    /* Swapped key => clients waiting for it. */
    entry = dictFind(c->db->io_keys,key);
    if (entry != NULL) {
        waiters = dictGetEntryVal(entry);
    } else {
        int retval;

        /* First waiter: create the list of clients blocked on this key. */
        waiters = listCreate();
        retval = dictAdd(c->db->io_keys,key,waiters);
        incrRefCount(key);
        assert(retval == DICT_OK);
    }
    listAddNodeTail(waiters,c);

    /* Are we already loading the key from disk? If not create a job */
    if (stored->storage == REDIS_VM_SWAPPED) {
        iojob *job;

        stored->storage = REDIS_VM_LOADING;
        job = zmalloc(sizeof(*job));
        job->type = REDIS_IOJOB_LOAD;
        job->db = c->db;
        job->key = stored;
        /* job->key and 'stored' are the same object here, so this
         * assignment leaves vtype unchanged. */
        job->key->vtype = stored->vtype;
        job->page = stored->vm.page;
        job->val = NULL;
        job->canceled = 0;
        job->thread = (pthread_t) -1;
        lockThreadedIO();
        queueIOJob(job);
        unlockThreadedIO();
    }
    return 1;
}

9290 9291 9292 9293 9294 9295 9296 9297 9298
/* Preload keys needed for the ZUNION and ZINTER commands.
 *
 * argv[2] holds the number of keys and the key names start at argv[3].
 * The count comes straight from the client, so it is validated against the
 * number of arguments actually received: if it claims more keys than were
 * passed, nothing is preloaded, instead of reading past the end of c->argv.
 * A zero or negative count preloads nothing as well. */
static void zunionInterBlockClientOnSwappedKeys(redisClient *c) {
    int i, num;

    num = atoi(c->argv[2]->ptr);
    if (num > (c->argc-3)) return;
    for (i = 0; i < num; i++) {
        waitForSwappedKey(c,c->argv[3+i]);
    }
}

9299
/* Is this client attempting to run a command against swapped keys?
 * If so, block it ASAP, load the keys in background, then resume it.
 *
 * The important idea about this function is that it can fail! If keys are
 * still swapped when the client is resumed, the key lookups will just block
 * loading keys from disk. In practical terms this should only happen with
 * the SORT BY command or if there is a bug in this function.
 *
 * If the command has a vm_preload_proc it is called to register the keys,
 * otherwise the arguments from vm_firstkey to vm_lastkey (a negative
 * vm_lastkey counts from the end of the argument vector) are visited in
 * steps of vm_keystep.
 *
 * Return 1 if the client is marked as blocked, 0 if the client can
 * continue as the keys it is going to access appear to be in memory. */
static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c) {
    if (cmd->vm_preload_proc != NULL) {
        cmd->vm_preload_proc(c);
    } else {
        int idx, last;

        if (cmd->vm_firstkey == 0) return 0;
        last = cmd->vm_lastkey;
        if (last < 0) last += c->argc;
        idx = cmd->vm_firstkey;
        while (idx <= last) {
            waitForSwappedKey(c,c->argv[idx]);
            idx += cmd->vm_keystep;
        }
    }

    /* Not waiting for any key: the client can go on. */
    if (listLength(c->io_keys) == 0) return 0;

    /* The client is waiting for at least one key, mark it as blocked. */
    c->flags |= REDIS_IO_WAIT;
    aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
    server.vm_blocked_clients++;
    return 1;
}

/* Remove 'key' from the list of keys the client 'c' is blocked on, and
 * remove the client from the list of clients waiting for that key.
 *
 * The key must be in c->io_keys and the client must be registered for the
 * key in c->db->io_keys (both are asserted).
 *
 * The function returns 1 when there are no longer blocking keys after
 * the current one was removed (and the client can be unblocked). */
static int dontWaitForSwappedKey(redisClient *c, robj *key) {
    list *waiters;
    listNode *node;
    listIter it;
    struct dictEntry *entry;

    /* Client side: drop the first entry of io_keys equal to 'key'. */
    listRewind(c->io_keys,&it);
    for (node = listNext(&it); node != NULL; node = listNext(&it)) {
        if (compareStringObjects(node->value,key) != 0) continue;
        listDelNode(c->io_keys,node);
        break;
    }
    assert(node != NULL);

    /* Key side: drop the client from the key => waiting clients map. */
    entry = dictFind(c->db->io_keys,key);
    assert(entry != NULL);
    waiters = dictGetEntryVal(entry);
    node = listSearchKey(waiters,c);
    assert(node != NULL);
    listDelNode(waiters,node);
    /* Nobody else is waiting for this key: remove the map entry. */
    if (listLength(waiters) == 0)
        dictDelete(c->db->io_keys,key);

    return listLength(c->io_keys) == 0;
}

/* Stop every client blocked on 'key' in 'db' from waiting for that key.
 * Clients that are left with no other key to wait for are appended to
 * server.io_ready_clients. Nothing is done if no client waits for the key. */
static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
    struct dictEntry *entry;
    list *waiters;
    int remaining;

    entry = dictFind(db->io_keys,key);
    if (entry == NULL) return;

    waiters = dictGetEntryVal(entry);
    /* Note: the number of iterations is fixed before looping. We can't use
     * something like while(listLength(waiters)) as the list can be freed
     * by the called function when the last element is removed. */
    for (remaining = listLength(waiters); remaining > 0; remaining--) {
        listNode *head = listFirst(waiters);
        redisClient *c = head->value;

        /* Skip clients still waiting for other keys. */
        if (!dontWaitForSwappedKey(c,key)) continue;
        /* All the keys this client needs are loaded: it is ready to go. */
        listAddNodeTail(server.io_ready_clients,c);
    }
}

9391 9392 9393 9394 9395 9396 9397 9398 9399 9400 9401 9402 9403 9404 9405 9406 9407 9408 9409 9410 9411 9412 9413 9414 9415 9416 9417 9418 9419 9420 9421 9422 9423 9424 9425 9426 9427 9428 9429 9430 9431 9432 9433 9434 9435 9436 9437 9438 9439 9440 9441 9442 9443 9444 9445 9446 9447 9448 9449 9450 9451 9452 9453 9454 9455 9456 9457 9458 9459 9460 9461 9462 9463 9464 9465 9466 9467 9468 9469 9470 9471 9472 9473 9474 9475 9476 9477 9478
/* =========================== Remote Configuration ========================= */

/* CONFIG SET <parameter> <value>.
 *
 * Supported parameters (matched case-insensitively): dbfilename,
 * requirepass, masterauth (the old string is freed and replaced with a copy
 * of the value) and maxmemory (parsed as a base 10 integer).
 * Replies with OK on success, or with an error for any other parameter. */
static void configSetCommand(redisClient *c) {
    robj *value = getDecodedObject(c->argv[3]);
    char *name = c->argv[2]->ptr;

    if (strcasecmp(name,"dbfilename") == 0) {
        zfree(server.dbfilename);
        server.dbfilename = zstrdup(value->ptr);
    } else if (strcasecmp(name,"requirepass") == 0) {
        zfree(server.requirepass);
        server.requirepass = zstrdup(value->ptr);
    } else if (strcasecmp(name,"masterauth") == 0) {
        zfree(server.masterauth);
        server.masterauth = zstrdup(value->ptr);
    } else if (strcasecmp(name,"maxmemory") == 0) {
        server.maxmemory = strtoll(value->ptr, NULL, 10);
    } else {
        addReplySds(c,sdscatprintf(sdsempty(),
            "-ERR not supported CONFIG parameter %s\r\n",
            (char*)c->argv[2]->ptr));
        decrRefCount(value);
        return;
    }

    decrRefCount(value);
    addReply(c,shared.ok);
}

/* CONFIG GET <pattern>.
 *
 * Replies with a multi bulk of name/value pairs for every supported
 * parameter (dbfilename, requirepass, masterauth, maxmemory) whose name
 * matches the glob-style pattern in argv[2].
 *
 * The multi bulk header is not known until all the parameters were tested,
 * so an empty string object ('lenobj') is queued first and its content is
 * filled in at the end. Our own reference to 'lenobj' is released only
 * after the content is set, so the object is never written to once this
 * function no longer holds a reference to it. */
static void configGetCommand(redisClient *c) {
    robj *o = getDecodedObject(c->argv[2]);
    robj *lenobj = createObject(REDIS_STRING,NULL);
    char *pattern = o->ptr;
    int matches = 0;

    addReply(c,lenobj);

    if (stringmatch(pattern,"dbfilename",0)) {
        addReplyBulkCString(c,"dbfilename");
        addReplyBulkCString(c,server.dbfilename);
        matches++;
    }
    if (stringmatch(pattern,"requirepass",0)) {
        addReplyBulkCString(c,"requirepass");
        addReplyBulkCString(c,server.requirepass);
        matches++;
    }
    if (stringmatch(pattern,"masterauth",0)) {
        addReplyBulkCString(c,"masterauth");
        addReplyBulkCString(c,server.masterauth);
        matches++;
    }
    if (stringmatch(pattern,"maxmemory",0)) {
        char buf[128];

        /* No trailing newline: the buffer is sent as the parameter value. */
        snprintf(buf,sizeof(buf),"%llu",(unsigned long long)server.maxmemory);
        addReplyBulkCString(c,"maxmemory");
        addReplyBulkCString(c,buf);
        matches++;
    }
    decrRefCount(o);
    /* Every match produced two bulk replies: the name and the value. */
    lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",matches*2);
    decrRefCount(lenobj);
}

/* CONFIG <subcommand> ... dispatcher.
 *
 * Handles SET (4 arguments in total), GET (3) and RESETSTAT (2), matched
 * case-insensitively. RESETSTAT zeroes the commands, connections and
 * expired keys counters and sets the start time to the current time.
 * An unknown subcommand or a wrong number of arguments is reported to the
 * client as an error. */
static void configCommand(redisClient *c) {
    char *sub = c->argv[1]->ptr;
    int arity_ok = 1;

    if (strcasecmp(sub,"set") == 0) {
        if (c->argc == 4)
            configSetCommand(c);
        else
            arity_ok = 0;
    } else if (strcasecmp(sub,"get") == 0) {
        if (c->argc == 3)
            configGetCommand(c);
        else
            arity_ok = 0;
    } else if (strcasecmp(sub,"resetstat") == 0) {
        if (c->argc == 2) {
            server.stat_numcommands = 0;
            server.stat_numconnections = 0;
            server.stat_expiredkeys = 0;
            server.stat_starttime = time(NULL);
            addReply(c,shared.ok);
        } else {
            arity_ok = 0;
        }
    } else {
        addReplySds(c,sdscatprintf(sdsempty(),
            "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n"));
    }
    if (arity_ok) return;

    addReplySds(c,sdscatprintf(sdsempty(),
        "-ERR Wrong number of arguments for CONFIG %s\r\n",
        (char*) c->argv[1]->ptr));
}

A
antirez 已提交
9479 9480
/* =========================== Pubsub implementation ======================== */

A
antirez 已提交
9481 9482 9483 9484 9485 9486 9487 9488 9489 9490 9491 9492 9493 9494 9495 9496 9497
static void freePubsubPattern(void *p) {
    pubsubPattern *pat = p;

    decrRefCount(pat->pattern);
    zfree(pat);
}

/* Compare two pubsubPattern structures. Returns 1 if both refer to the same
 * client and their pattern objects compare equal as strings, 0 otherwise. */
static int listMatchPubsubPattern(void *a, void *b) {
    pubsubPattern *lhs = a;
    pubsubPattern *rhs = b;

    if (lhs->client != rhs->client) return 0;
    return compareStringObjects(lhs->pattern,rhs->pattern) == 0;
}

/* Subscribe a client to a channel. Returns 1 if the operation succeeded, or
 * 0 if the client was already subscribed to that channel.
 *
 * In both cases the client receives a "subscribe" reply carrying the channel
 * name and its current number of subscriptions (channels plus patterns). */
static int pubsubSubscribeChannel(redisClient *c, robj *channel) {
    int added = 0;

    /* Client side: channel set of this client. */
    if (dictAdd(c->pubsub_channels,channel,NULL) == DICT_OK) {
        struct dictEntry *entry;
        list *subscribers = NULL;

        added = 1;
        incrRefCount(channel);
        /* Server side: channel => list of subscribed clients. */
        entry = dictFind(server.pubsub_channels,channel);
        if (entry != NULL) {
            subscribers = dictGetEntryVal(entry);
        } else {
            subscribers = listCreate();
            dictAdd(server.pubsub_channels,channel,subscribers);
            incrRefCount(channel);
        }
        listAddNodeTail(subscribers,c);
    }

    /* Notify the client */
    addReply(c,shared.mbulk3);
    addReply(c,shared.subscribebulk);
    addReplyBulk(c,channel);
    addReplyLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
    return added;
}

A
antirez 已提交
9525 9526 9527
/* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
 * 0 if the client was not subscribed to the specified channel.
 *
 * When 'notify' is non-zero an "unsubscribe" reply with the channel name and
 * the number of remaining subscriptions is sent in both cases. */
static int pubsubUnsubscribeChannel(redisClient *c, robj *channel, int notify) {
    int removed = 0;

    /* 'channel' may be just a pointer to the same object we have in the
     * hash tables. Hold a reference while we work on it. */
    incrRefCount(channel);

    /* Client side: channel set of this client. */
    if (dictDelete(c->pubsub_channels,channel) == DICT_OK) {
        struct dictEntry *entry;
        list *subscribers;
        listNode *node;

        removed = 1;
        /* Server side: channel => list of subscribed clients. */
        entry = dictFind(server.pubsub_channels,channel);
        assert(entry != NULL);
        subscribers = dictGetEntryVal(entry);
        node = listSearchKey(subscribers,c);
        assert(node != NULL);
        listDelNode(subscribers,node);
        /* Remove the hash entry as well if this was the last client, so
         * that it will not be possible to abuse Redis PUBSUB by creating
         * millions of channels. */
        if (listLength(subscribers) == 0)
            dictDelete(server.pubsub_channels,channel);
    }

    /* Notify the client */
    if (notify) {
        addReply(c,shared.mbulk3);
        addReply(c,shared.unsubscribebulk);
        addReplyBulk(c,channel);
        addReplyLong(c,dictSize(c->pubsub_channels)+
                       listLength(c->pubsub_patterns));
    }
    decrRefCount(channel); /* it is finally safe to release it */
    return removed;
}

/* Subscribe a client to a pattern. Returns 1 if the operation succeeded, or
 * 0 if the client was already subscribed to that pattern.
 *
 * In both cases the client receives a "psubscribe" reply carrying the
 * pattern and its current number of subscriptions (channels plus patterns). */
static int pubsubSubscribePattern(redisClient *c, robj *pattern) {
    int added = 0;
    listNode *existing = listSearchKey(c->pubsub_patterns,pattern);

    if (existing == NULL) {
        pubsubPattern *entry;

        added = 1;
        /* Client side: list of patterns of this client. */
        listAddNodeTail(c->pubsub_patterns,pattern);
        incrRefCount(pattern);
        /* Server side: global list of (client, decoded pattern) pairs. */
        entry = zmalloc(sizeof(*entry));
        entry->pattern = getDecodedObject(pattern);
        entry->client = c;
        listAddNodeTail(server.pubsub_patterns,entry);
    }

    /* Notify the client */
    addReply(c,shared.mbulk3);
    addReply(c,shared.psubscribebulk);
    addReplyBulk(c,pattern);
    addReplyLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
    return added;
}

/* Unsubscribe a client from a pattern. Returns 1 if the operation succeeded,
 * or 0 if the client was not subscribed to the specified pattern.
 *
 * When 'notify' is non-zero a "punsubscribe" reply with the pattern and the
 * number of remaining subscriptions is sent in both cases. */
static int pubsubUnsubscribePattern(redisClient *c, robj *pattern, int notify) {
    listNode *node;
    int removed = 0;

    /* Protect the object: it may be the same one we are about to remove. */
    incrRefCount(pattern);

    node = listSearchKey(c->pubsub_patterns,pattern);
    if (node != NULL) {
        pubsubPattern probe;

        removed = 1;
        /* Client side: list of patterns of this client. */
        listDelNode(c->pubsub_patterns,node);
        /* Server side: look up the (client, pattern) pair using a
         * temporary structure as search key, and remove it. */
        probe.client = c;
        probe.pattern = pattern;
        node = listSearchKey(server.pubsub_patterns,&probe);
        listDelNode(server.pubsub_patterns,node);
    }

    /* Notify the client */
    if (notify) {
        addReply(c,shared.mbulk3);
        addReply(c,shared.punsubscribebulk);
        addReplyBulk(c,pattern);
        addReplyLong(c,dictSize(c->pubsub_channels)+
                       listLength(c->pubsub_patterns));
    }
    decrRefCount(pattern);
    return removed;
}

A
antirez 已提交
9615 9616 9617 9618
/* Unsubscribe from all the channels. Return the number of channels the
 * client was subscribed to. 'notify' is passed through to
 * pubsubUnsubscribeChannel() for every channel. */
static int pubsubUnsubscribeAllChannels(redisClient *c, int notify) {
    dictIterator *iter = dictGetIterator(c->pubsub_channels);
    dictEntry *entry;
    int unsubscribed = 0;

    /* NOTE(review): every step removes the returned entry from the dict
     * being iterated; this assumes dictNext() tolerates the deletion of the
     * entry it just returned - confirm against the dict implementation. */
    for (entry = dictNext(iter); entry != NULL; entry = dictNext(iter)) {
        robj *channel = dictGetEntryKey(entry);

        unsubscribed += pubsubUnsubscribeChannel(c,channel,notify);
    }
    dictReleaseIterator(iter);
    return unsubscribed;
}

A
antirez 已提交
9631 9632 9633 9634 9635 9636 9637 9638 9639 9640 9641 9642 9643 9644 9645 9646
/* Unsubscribe from all the patterns. Return the number of patterns the
 * client was subscribed to. 'notify' is passed through to
 * pubsubUnsubscribePattern() for every pattern. */
static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify) {
    listIter iter;
    listNode *node;
    int unsubscribed = 0;

    listRewind(c->pubsub_patterns,&iter);
    for (node = listNext(&iter); node != NULL; node = listNext(&iter)) {
        robj *pattern = node->value;

        unsubscribed += pubsubUnsubscribePattern(c,pattern,notify);
    }
    return unsubscribed;
}

A
antirez 已提交
9647
/* Publish a message */
A
antirez 已提交
9648
static int pubsubPublishMessage(robj *channel, robj *message) {
A
antirez 已提交
9649 9650
    int receivers = 0;
    struct dictEntry *de;
A
antirez 已提交
9651 9652
    listNode *ln;
    listIter li;
A
antirez 已提交
9653

A
antirez 已提交
9654 9655
    /* Send to clients listening for that channel */
    de = dictFind(server.pubsub_channels,channel);
A
antirez 已提交
9656 9657 9658 9659 9660 9661 9662 9663 9664 9665 9666
    if (de) {
        list *list = dictGetEntryVal(de);
        listNode *ln;
        listIter li;

        listRewind(list,&li);
        while ((ln = listNext(&li)) != NULL) {
            redisClient *c = ln->value;

            addReply(c,shared.mbulk3);
            addReply(c,shared.messagebulk);
A
antirez 已提交
9667
            addReplyBulk(c,channel);
A
antirez 已提交
9668 9669 9670 9671
            addReplyBulk(c,message);
            receivers++;
        }
    }
A
antirez 已提交
9672 9673 9674 9675 9676 9677 9678 9679 9680 9681 9682 9683 9684 9685 9686 9687 9688 9689 9690 9691
    /* Send to clients listening to matching channels */
    if (listLength(server.pubsub_patterns)) {
        listRewind(server.pubsub_patterns,&li);
        channel = getDecodedObject(channel);
        while ((ln = listNext(&li)) != NULL) {
            pubsubPattern *pat = ln->value;

            if (stringmatchlen((char*)pat->pattern->ptr,
                                sdslen(pat->pattern->ptr),
                                (char*)channel->ptr,
                                sdslen(channel->ptr),0)) {
                addReply(pat->client,shared.mbulk3);
                addReply(pat->client,shared.messagebulk);
                addReplyBulk(pat->client,channel);
                addReplyBulk(pat->client,message);
                receivers++;
            }
        }
        decrRefCount(channel);
    }
A
antirez 已提交
9692 9693 9694 9695 9696 9697 9698
    return receivers;
}

/* SUBSCRIBE channel [channel ...]: subscribe the client to every channel
 * given as argument, in order. */
static void subscribeCommand(redisClient *c) {
    int idx = 1;

    while (idx < c->argc) {
        pubsubSubscribeChannel(c,c->argv[idx]);
        idx++;
    }
}

/* UNSUBSCRIBE [channel ...]: without arguments unsubscribe the client from
 * all its channels, otherwise only from the channels given. The client is
 * always notified. */
static void unsubscribeCommand(redisClient *c) {
    int idx;

    if (c->argc == 1) {
        pubsubUnsubscribeAllChannels(c,1);
        return;
    }
    for (idx = 1; idx < c->argc; idx++)
        pubsubUnsubscribeChannel(c,c->argv[idx],1);
}

/* PSUBSCRIBE pattern [pattern ...]: subscribe the client to every pattern
 * given as argument, in order. */
static void psubscribeCommand(redisClient *c) {
    int idx = 1;

    while (idx < c->argc) {
        pubsubSubscribePattern(c,c->argv[idx]);
        idx++;
    }
}

/* PUNSUBSCRIBE [pattern ...]: without arguments unsubscribe the client from
 * all its patterns, otherwise only from the patterns given. The client is
 * always notified. */
static void punsubscribeCommand(redisClient *c) {
    int idx;

    if (c->argc == 1) {
        pubsubUnsubscribeAllPatterns(c,1);
        return;
    }
    for (idx = 1; idx < c->argc; idx++)
        pubsubUnsubscribePattern(c,c->argv[idx],1);
}

/* PUBLISH channel message: deliver the message and reply with the number
 * of deliveries reported by pubsubPublishMessage(). */
static void publishCommand(redisClient *c) {
    robj *channel = c->argv[1];
    robj *message = c->argv[2];

    addReplyLong(c,pubsubPublishMessage(channel,message));
}

9738 9739 9740 9741 9742
/* ================================= Debugging ============================== */

/* DEBUG <subcommand> [<key>].
 *
 * Subcommands (matched case-insensitively):
 *   SEGFAULT       write to an invalid address on purpose.
 *   RELOAD         save the dataset, empty it and load it again from disk.
 *   LOADAOF        empty the dataset and load the append only file.
 *   OBJECT <key>   reply with addresses, reference counts and encoding of
 *                  the key/value pair, or with the swap location if the
 *                  value is swapped out.
 *   SWAPIN <key>   look the key up for reading, then reply OK.
 *   SWAPOUT <key>  swap the value of the key out, blocking.
 *
 * Anything else is answered with a syntax error listing the subcommands. */
static void debugCommand(redisClient *c) {
    if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
        /* Intentional invalid write: this is the whole point of the
         * subcommand. */
        *((char*)-1) = 'x';
    } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
        if (rdbSave(server.dbfilename) != REDIS_OK) {
            addReply(c,shared.err);
            return;
        }
        emptyDb();
        if (rdbLoad(server.dbfilename) != REDIS_OK) {
            addReply(c,shared.err);
            return;
        }
        redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
        addReply(c,shared.ok);
    } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
        emptyDb();
        if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
            addReply(c,shared.err);
            return;
        }
        redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
        addReply(c,shared.ok);
    } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
        dictEntry *de = dictFind(c->db->dict,c->argv[2]);
        robj *key, *val;

        if (!de) {
            addReply(c,shared.nokeyerr);
            return;
        }
        key = dictGetEntryKey(de);
        val = dictGetEntryVal(de);
        /* The value can be inspected only when VM is disabled or the key
         * is in memory (a key being swapped out is in this group too). */
        if (!server.vm_enabled || (key->storage == REDIS_VM_MEMORY ||
                                   key->storage == REDIS_VM_SWAPPING)) {
            char *strenc;
            char buf[128];

            if (val->encoding < (sizeof(strencoding)/sizeof(char*))) {
                strenc = strencoding[val->encoding];
            } else {
                snprintf(buf,sizeof(buf),"unknown encoding %d\n", val->encoding);
                strenc = buf;
            }
            addReplySds(c,sdscatprintf(sdsempty(),
                "+Key at:%p refcount:%d, value at:%p refcount:%d "
                "encoding:%s serializedlength:%lld\r\n",
                (void*)key, key->refcount, (void*)val, val->refcount,
                strenc, (long long) rdbSavedObjectLen(val,NULL)));
        } else {
            addReplySds(c,sdscatprintf(sdsempty(),
                "+Key at:%p refcount:%d, value swapped at: page %llu "
                "using %llu pages\r\n",
                (void*)key, key->refcount, (unsigned long long) key->vm.page,
                (unsigned long long) key->vm.usedpages));
        }
    } else if (!strcasecmp(c->argv[1]->ptr,"swapin") && c->argc == 3) {
        lookupKeyRead(c->db,c->argv[2]);
        addReply(c,shared.ok);
    } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
        dictEntry *de = dictFind(c->db->dict,c->argv[2]);
        robj *key, *val;

        if (!server.vm_enabled) {
            addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
            return;
        }
        if (!de) {
            addReply(c,shared.nokeyerr);
            return;
        }
        key = dictGetEntryKey(de);
        val = dictGetEntryVal(de);
        /* If the key is shared we want to create a copy */
        if (key->refcount > 1) {
            robj *newkey = dupStringObject(key);
            decrRefCount(key);
            key = dictGetEntryKey(de) = newkey;
        }
        /* Swap it */
        if (key->storage != REDIS_VM_MEMORY) {
            addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
        } else if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
            dictGetEntryVal(de) = NULL;
            addReply(c,shared.ok);
        } else {
            addReply(c,shared.err);
        }
    } else {
        /* The help lists every subcommand handled above, LOADAOF included. */
        addReplySds(c,sdsnew(
            "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPIN <key>|SWAPOUT <key>|RELOAD|LOADAOF]\r\n"));
    }
}
9833

9834
static void _redisAssert(char *estr, char *file, int line) {
9835
    redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
9836
    redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true\n",file,line,estr);
9837 9838 9839 9840 9841 9842
#ifdef HAVE_BACKTRACE
    redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
    *((char*)-1) = 'x';
#endif
}

9843
/* =================================== Main! ================================ */
9844

9845 9846 9847 9848
#ifdef __linux__
/* Read the first line of /proc/sys/vm/overcommit_memory and return it
 * converted with atoi(). Returns -1 if the file can't be opened or read. */
int linuxOvercommitMemoryValue(void) {
    char line[64];
    char *got;
    FILE *fp = fopen("/proc/sys/vm/overcommit_memory","r");

    if (fp == NULL) return -1;
    got = fgets(line,64,fp);
    fclose(fp);
    if (got == NULL) return -1;

    return atoi(line);
}

/* Log a warning, with instructions to fix the setting, when
 * linuxOvercommitMemoryValue() reports 0. Any other value is ignored. */
void linuxOvercommitMemoryWarning(void) {
    if (linuxOvercommitMemoryValue() != 0) return;

    redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low condition memory. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
}
#endif /* __linux__ */

/* Detach the server from the terminal: fork (only the process for which
 * fork() returns 0 goes on), create a new session, redirect the standard
 * descriptors to /dev/null and write the pid to server.pidfile.
 *
 * Failures to open /dev/null or the pid file are silently ignored. */
static void daemonize(void) {
    int nullfd;
    FILE *pidfp;

    /* Note: any non-zero return of fork() takes the exit path. */
    if (fork() != 0) exit(0); /* parent exits */
    setsid(); /* create a new session */

    /* Every output goes to /dev/null. If Redis is daemonized but
     * the 'logfile' is set to 'stdout' in the configuration file
     * it will not log at all. */
    nullfd = open("/dev/null", O_RDWR, 0);
    if (nullfd != -1) {
        dup2(nullfd, STDIN_FILENO);
        dup2(nullfd, STDOUT_FILENO);
        dup2(nullfd, STDERR_FILENO);
        /* Don't close it if it is one of the standard descriptors. */
        if (nullfd > STDERR_FILENO) close(nullfd);
    }

    /* Try to write the pid file */
    pidfp = fopen(server.pidfile,"w");
    if (pidfp == NULL) return;
    fprintf(pidfp,"%d\n",getpid());
    fclose(pidfp);
}

9891 9892 9893 9894 9895
static void version() {
    printf("Redis server version %s\n", REDIS_VERSION);
    exit(0);
}

9896 9897
/* Print the command line synopsis on standard error, then terminate the
 * process with exit status 1. */
static void usage() {
    fputs("Usage: ./redis-server [/path/to/redis.conf]\n",stderr);
    fputs("       ./redis-server - (read config from stdin)\n",stderr);
    exit(1);
}

9902
int main(int argc, char **argv) {
9903 9904
    time_t start;

9905 9906
    initServerConfig();
    if (argc == 2) {
9907 9908 9909
        if (strcmp(argv[1], "-v") == 0 ||
            strcmp(argv[1], "--version") == 0) version();
        if (strcmp(argv[1], "--help") == 0) usage();
9910 9911
        resetServerSaveParams();
        loadServerConfig(argv[1]);
9912 9913
    } else if ((argc > 2)) {
        usage();
9914 9915 9916 9917
    } else {
        redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
    }
    if (server.daemonize) daemonize();
9918
    initServer();
9919 9920 9921 9922
    redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
#ifdef __linux__
    linuxOvercommitMemoryWarning();
#endif
9923
    start = time(NULL);
9924 9925
    if (server.appendonly) {
        if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
9926
            redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
9927 9928
    } else {
        if (rdbLoad(server.dbfilename) == REDIS_OK)
9929
            redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
9930 9931
    }
    redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
A
antirez 已提交
9932
    aeSetBeforeSleepProc(server.el,beforeSleep);
9933 9934 9935 9936 9937 9938 9939 9940 9941 9942
    aeMain(server.el);
    aeDeleteEventLoop(server.el);
    return 0;
}

/* ============================= Backtrace support ========================= */

#ifdef HAVE_BACKTRACE
static char *findFuncName(void *pointer, unsigned long *offset);

9943 9944 9945 9946 9947
static void *getMcontextEip(ucontext_t *uc) {
#if defined(__FreeBSD__)
    return (void*) uc->uc_mcontext.mc_eip;
#elif defined(__dietlibc__)
    return (void*) uc->uc_mcontext.eip;
9948
#elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
9949 9950 9951
  #if __x86_64__
    return (void*) uc->uc_mcontext->__ss.__rip;
  #else
9952
    return (void*) uc->uc_mcontext->__ss.__eip;
9953
  #endif
9954
#elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
9955
  #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
9956
    return (void*) uc->uc_mcontext->__ss.__rip;
9957 9958
  #else
    return (void*) uc->uc_mcontext->__ss.__eip;
A
Alex McHale 已提交
9959
  #endif
A
antirez 已提交
9960
#elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
A
antirez 已提交
9961
    return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
9962 9963 9964 9965
#elif defined(__ia64__) /* Linux IA64 */
    return (void*) uc->uc_mcontext.sc_ip;
#else
    return NULL;
9966 9967 9968 9969 9970 9971 9972 9973 9974
#endif
}

/* Handler installed by setupSigSegvAction() for fatal signals.
 *
 * Logs the signal number, the string returned by genRedisInfoString() and
 * a stack trace, then terminates the process with _exit(0).
 *
 *   sig    - number of the signal being handled.
 *   info   - unused.
 *   secret - context pointer passed by the kernel; it is treated as a
 *            ucontext_t and used to recover the faulting address.
 *
 * For every frame the name found by findFuncName() is preferred; the line
 * produced by backtrace_symbols() is used when no name is found, or when
 * the offset parsed after its '+' is smaller than the one computed by
 * findFuncName(). */
static void segvHandler(int sig, siginfo_t *info, void *secret) {
    void *trace[100];
    char **messages = NULL;
    int i, trace_size = 0;
    unsigned long offset=0;
    ucontext_t *uc = (ucontext_t*) secret;
    void *eip;
    sds infostring;
    REDIS_NOTUSED(info);

    redisLog(REDIS_WARNING,
        "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
    infostring = genRedisInfoString();
    redisLog(REDIS_WARNING, "%s",infostring);
    /* It's not safe to sdsfree() the returned string under memory
     * corruption conditions. Let it leak as we are going to abort */

    trace_size = backtrace(trace, 100);
    /* overwrite sigaction with caller's address */
    eip = getMcontextEip(uc);
    if (eip != NULL) {
        trace[1] = eip;
    }
    /* backtrace_symbols() returns NULL when it cannot allocate the
     * result: 'messages' must be checked before being indexed. */
    messages = backtrace_symbols(trace, trace_size);

    for (i=1; i<trace_size; ++i) {
        char *fn = findFuncName(trace[i], &offset);
        char *msg = messages ? messages[i] : NULL;
        char *p = msg ? strchr(msg,'+') : NULL;

        if (fn && !(p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
            redisLog(REDIS_WARNING,"%d redis-server %p %s + %u", i, trace[i], fn, (unsigned int)offset);
        } else if (msg) {
            redisLog(REDIS_WARNING,"%s", msg);
        } else {
            /* Neither source could describe the frame: log the raw
             * address so that the frame is not silently dropped. */
            redisLog(REDIS_WARNING,"%d redis-server %p", i, trace[i]);
        }
    }
    /* free(messages); Don't call free() with possibly corrupted memory. */
    _exit(0);
}
10005 10006 10007 10008 10009 10010 10011 10012 10013 10014 10015

/* Install segvHandler() as the handler for SIGSEGV, SIGBUS, SIGFPE and
 * SIGILL. */
static void setupSigSegvAction(void) {
    struct sigaction act;

    sigemptyset (&act.sa_mask);
    /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
     * is used. Otherwise, sa_handler is used */
    act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
    act.sa_sigaction = segvHandler;
    /* Each signal is registered exactly once. */
    sigaction (SIGSEGV, &act, NULL);
    sigaction (SIGBUS, &act, NULL);
    sigaction (SIGFPE, &act, NULL);
    sigaction (SIGILL, &act, NULL);
}
10021

10022 10023 10024 10025 10026 10027 10028
#include "staticsymbols.h"
/* Translate a code address into a function name using symsTable.
 *
 * It is used in order to provide, under segmentation fault, a backtrace
 * that is able to display functions declared as static (otherwise the
 * backtrace is useless).
 *
 * Among the table entries whose address is not greater than 'pointer', the
 * closest one is selected: its name is returned and the distance from its
 * address is stored in '*offset'. Returns NULL, leaving '*offset'
 * untouched, when no entry qualifies or when 'pointer' is (void*)-1. */
static char *findFuncName(void *pointer, unsigned long *offset){
    unsigned long addr = (unsigned long) pointer;
    unsigned long best_off = 0;
    int best = -1, idx;

    /* The all-ones address is never matched against the table. */
    if (addr == (unsigned long)-1) return NULL;

    /* Try to match against the Symbol with the smallest offset */
    for (idx = 0; symsTable[idx].pointer; idx++) {
        unsigned long delta;

        if (addr < symsTable[idx].pointer) continue;
        delta = addr - symsTable[idx].pointer;
        if (best < 0 || delta < best_off) {
            best_off = delta;
            best = idx;
        }
    }
    if (best == -1) return NULL;
    *offset = best_off;
    return symsTable[best].name;
}
10046 10047
#else /* HAVE_BACKTRACE */
/* Built without HAVE_BACKTRACE: no handler is installed, the function only
 * exists so that callers do not need their own #ifdef. */
static void setupSigSegvAction(void) {
}
10049
#endif /* HAVE_BACKTRACE */
/* The End */