From 307a189900c06bb4f76638277275f70f2e480558 Mon Sep 17 00:00:00 2001 From: antirez Date: Sun, 30 Mar 2014 19:26:13 +0200 Subject: [PATCH] HyperLogLog approximated cardinality caching. The more we add elements to an HyperLogLog counter, the smaller is the probability that we actually update some register. From this observation it is easy to see how it is possible to use caching of a previously computed cardinality and reuse it to serve HLLCOUNT queries as long as no register was updated in the data structure. This commit does exactly this by using just additional 8 bytes for the data structure to store a 64 bit unsigned integer value cached cardinality. When the most significant bit of the 64 bit integer is set, it means that the value computed is no longer usable since at least a single register was modified and we need to recompute it at the next call of HLLCOUNT. The value is always stored in little endian format regardless of the actual CPU endianess. --- src/hyperloglog.c | 54 ++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 51 insertions(+), 3 deletions(-) diff --git a/src/hyperloglog.c b/src/hyperloglog.c index 48d3489a2..aebed3e74 100644 --- a/src/hyperloglog.c +++ b/src/hyperloglog.c @@ -51,14 +51,35 @@ * * [2] P. Flajolet, Éric Fusy, O. Gandouet, and F. Meunier. Hyperloglog: The * analysis of a near-optimal cardinality estimation algorithm. - */ + * + * The representation used by Redis is the following: + * + * +--------+--------+--------+------// //--+---------------------+ + * |11000000|22221111|33333322|55444444 .... | 64 bit cardinality | + * +--------+--------+--------+------// //--+---------------------+ + * + * The 6 bits counters are encoded one after the other starting from the + * LSB to the MSB, and using the next bytes as needed. + * + * At the end of the 16k counters, there is an additional 64 bit integer + * stored in little endian format with the latest cardinality computed that + * can be reused if the data structure was not modified since the last + * computation (this is useful because there are high probabilities that + * HLLADD operations don't modify the actual data structure and hence the + * approximated cardinality). + * + * When the most significant bit in the most significant byte of the cached + * cardinality is set, it means that the data structure was modified and + * we can't reuse the cached value that must be recomputed. */ #define REDIS_HLL_P 14 /* The greater is P, the smaller the error. */ #define REDIS_HLL_REGISTERS (1<db,c->argv[1]); notifyKeyspaceEvent(REDIS_NOTIFY_STRING,"hlladd",c->argv[1],c->db->id); server.dirty++; + /* Invalidate the cached cardinality. */ + registers[REDIS_HLL_SIZE-1] |= (1<<7); } addReply(c, updated ? shared.cone : shared.czero); } @@ -447,6 +470,7 @@ void hllAddCommand(redisClient *c) { void hllCountCommand(redisClient *c) { robj *o = lookupKeyRead(c->db,c->argv[1]); uint8_t *registers; + uint64_t card; if (o == NULL) { /* No key? Cardinality is zero since no element was added, otherwise @@ -465,8 +489,32 @@ void hllCountCommand(redisClient *c) { REDIS_HLL_SIZE); return; } + + /* Check if the cached cardinality is valid. */ registers = o->ptr; - addReplyLongLong(c,hllCount(registers)); + if ((registers[REDIS_HLL_SIZE-1] & (1<<7)) == 0) { + /* Just return the cached value. */ + card = (uint64_t)registers[REDIS_HLL_SIZE-8]; + card |= (uint64_t)registers[REDIS_HLL_SIZE-7] << 8; + card |= (uint64_t)registers[REDIS_HLL_SIZE-6] << 16; + card |= (uint64_t)registers[REDIS_HLL_SIZE-5] << 24; + card |= (uint64_t)registers[REDIS_HLL_SIZE-4] << 32; + card |= (uint64_t)registers[REDIS_HLL_SIZE-3] << 40; + card |= (uint64_t)registers[REDIS_HLL_SIZE-2] << 48; + card |= (uint64_t)registers[REDIS_HLL_SIZE-1] << 56; + } else { + /* Recompute it and update the cached value. */ + card = hllCount(registers); + registers[REDIS_HLL_SIZE-8] = card & 0xff; + registers[REDIS_HLL_SIZE-7] = (card >> 8) & 0xff; + registers[REDIS_HLL_SIZE-6] = (card >> 16) & 0xff; + registers[REDIS_HLL_SIZE-5] = (card >> 24) & 0xff; + registers[REDIS_HLL_SIZE-4] = (card >> 32) & 0xff; + registers[REDIS_HLL_SIZE-3] = (card >> 40) & 0xff; + registers[REDIS_HLL_SIZE-2] = (card >> 48) & 0xff; + registers[REDIS_HLL_SIZE-1] = (card >> 56) & 0xff; + } + addReplyLongLong(c,card); } } -- GitLab