From aadd513160f8c2405fda44e69f0b7b2901b9fa06 Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 11 Apr 2014 17:27:40 +0200 Subject: [PATCH] HyperLogLog sparse representation initial implementation. Code never tested, but the basic layout is shaped in this commit. Also missing: 1) Sparse -> Dense conversion function. 2) New HLL object creation using the sparse representation. 3) Implementation of PFMERGE for the sparse representation. --- src/hyperloglog.c | 278 ++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 269 insertions(+), 9 deletions(-) diff --git a/src/hyperloglog.c b/src/hyperloglog.c index 0cd92a0a..a95a21c9 100644 --- a/src/hyperloglog.c +++ b/src/hyperloglog.c @@ -112,7 +112,7 @@ * XZERO opcode is represented by two bytes 01xxxxxx yyyyyyyy. The 14-bit * integer represented by the bits 'xxxxxx' as most significant bits and * 'yyyyyyyy' as least significant bits, plus 1, means that there are N - * registers set to 0. This opcode can represent from 65 to 16384 contiguous + * registers set to 0. This opcode can represent from 0 to 16384 contiguous * registers set to the value of 0. * * VAL opcode is represented as 1vvvvvxx. It contains a 5-bit integer @@ -357,13 +357,30 @@ struct hllhdr { /* Macros to access the sparse representation. * The macros parameter is expected to be an uint8_t pointer. */ +#define HLL_SPARSE_XZERO_BIT 0x40 /* 01xxxxxx */ +#define HLL_SPARSE_VAL_BIT 0x80 /* 1vvvvvxx */ #define HLL_SPARSE_IS_ZERO(p) (((*p) & 0xc0) == 0) /* 00xxxxxx */ -#define HLL_SPARSE_IS_XZERO(p) (((*p) & 0xc0) == 0x40) /* 01xxxxxx */ -#define HLL_SPARSE_IS_VAL(p) ((*p) & 0x80) /* 1vvvvvxx */ -#define HLL_SPARSE_ZERO_LEN(p) ((*p) & 0x3f) -#define HLL_SPARSE_XZERO_LEN(p) ((((*p) & 0x3f) << 6) | (*p)) -#define HLL_SPARSE_VAL_VALUE(p) (((*p) >> 2) & 0x1f) -#define HLL_SPARSE_VAL_LEN(p) ((*p) & 0x3) +#define HLL_SPARSE_IS_XZERO(p) (((*p) & 0xc0) == HLL_SPARSE_XZERO_BIT) +#define HLL_SPARSE_IS_VAL(p) ((*p) & HLL_SPARSE_VAL_BIT) +#define HLL_SPARSE_ZERO_LEN(p) (((*p) & 0x3f)+1) +#define HLL_SPARSE_XZERO_LEN(p) (((((*p) & 0x3f) << 6) | (*p))+1) +#define HLL_SPARSE_VAL_VALUE(p) ((((*p) >> 2) & 0x1f)+1) +#define HLL_SPARSE_VAL_LEN(p) (((*p) & 0x3)+1) +#define HLL_SPARSE_VAL_MAX_VALUE 32 +#define HLL_SPARSE_VAL_MAX_LEN 4 +#define HLL_SPARSE_ZERO_MAX_LEN 64 +#define HLL_SPARSE_XZERO_MAX_LEN 16384 +#define HLL_SPARSE_VAL_SET(p,val,len) do { \ + *(p) = (((val)-1)<<2|((len)-1))|HLL_SPARSE_VAL_BIT; \ +} while(0) +#define HLL_SPARSE_ZERO_SET(p,len) do { \ + *(p) = (len)-1; \ +} while(0) +#define HLL_SPARSE_XZERO_SET(p,len) do { \ + int _l = (len)-1; \ + *(p) = (_l>>8) | HLL_SPARSE_XZERO_BIT; \ + *(p+1) = (_l&0xff); \ +} while(0) /* ========================= HyperLogLog algorithm ========================= */ @@ -538,6 +555,248 @@ double hllDenseSum(uint8_t *registers, double *PE, int *ezp) { /* ================== Sparse representation implementation ================= */ +sds hllSparseToDense(sds *sparse) { + return sdsnew("TODO"); +} + +/* "Add" the element in the sparse hyperloglog data structure. + * Actually nothing is added, but the max 0 pattern counter of the subset + * the element belongs to is incremented if needed. + * + * The object 'o' is the String object holding the HLL. The function requires + * a reference to the object in order to be able to enlarge the string if + * needed. + * + * On success, the function returns 1 if the cardinality changed, or 0 + * if the register for this element was not updated. + * + * As a side effect the function may promote the HLL representation from + * sparse to dense: this happens when a register requires to be set to a value + * not representable with the sparse representation, or when the resulting + * size would be greater than HLL_SPARSE_MAX. */ +int hllSparseAdd(robj *o, unsigned char *ele, size_t elesize) { + struct hllhdr *hdr; + uint8_t oldcount, count, *sparse, *end, *p, *prev, *next; + int index, first, span; + int is_zero = 0, is_xzero = 0, is_val = 0, runlen = 0; + + /* Update the register if this element produced a longer run of zeroes. */ + count = hllPatLen(ele,elesize,&index); + + /* If the count is too big to be representable by the sparse representation + * switch to dense representation. */ + if (count > HLL_SPARSE_VAL_MAX_VALUE) goto promote; + + /* When updating a sparse representation, sometimes we may need to + * enlarge the buffer for up to 3 bytes in the worst case (XZERO split + * into XZERO-VAL-XZERO). Make sure there is enough space right now + * so that the pointers we take during the execution of the function + * will be valid all the time. */ + o->ptr = sdsMakeRoomFor(o->ptr,3); + + /* Step 1: we need to locate the opcode we need to modify to check + * if a value update is actually needed. */ + sparse = p = ((uint8_t*)o->ptr) + HLL_HDR_SIZE; + end = p + sdslen(o->ptr) - HLL_HDR_SIZE; + + first = 0; + prev = NULL; /* Points to previos opcode at the end of the loop. */ + next = NULL; /* Points to the next opcode at the end of the loop. */ + while(p < end) { + /* Set span to the number of registers covered by this opcode. */ + if (HLL_SPARSE_IS_ZERO(p)) span = HLL_SPARSE_ZERO_LEN(p); + else if (HLL_SPARSE_IS_XZERO(p)) span = HLL_SPARSE_XZERO_LEN(p); + else span = HLL_SPARSE_VAL_LEN(p); + /* Break if this opcode covers the register as 'index'. */ + if (first+span >= index) break; + prev = p; + p += (HLL_SPARSE_IS_XZERO(p)) ? 2 : 1; + first += span; + } + + next = HLL_SPARSE_IS_XZERO(p) ? p+2 : p+1; + if (next >= end) next = NULL; + + /* Cache current opcode type to avoid using the macro again and + * again for something that will not change. + * Also cache the run-length of the opcode. */ + if (HLL_SPARSE_IS_ZERO(p)) { + is_zero = 1; + runlen = HLL_SPARSE_ZERO_LEN(p); + } else if (HLL_SPARSE_IS_XZERO(p)) { + is_xzero = 1; + runlen = HLL_SPARSE_XZERO_LEN(p); + } else { + is_val = 1; + runlen = HLL_SPARSE_VAL_LEN(p); + } + + /* Step 2: After the loop: + * + * 'first' stores to the index of the first register covered + * by the current opcode, which is pointed by 'p'. + * + * 'next' ad 'prev' store respectively the next and previous opcode, + * or NULL if the opcode at 'p' is respectively the last or first. + * + * 'span' is set to the number of registers covered by the current + * opcode. + * + * There are different cases in order to update the data structure + * in place without generating it from scratch: + * + * A) If it is a VAL opcode already set to a value >= our 'count' + * no update is needed, regardless of the VAL run-length field. + * In this case PFADD returns 0 since no changes are performed. + * + * B) If it is a VAL opcode with len = 1 (representing only our + * register) and the value is less than 'count', we just update it + * since this is a trivial case. */ + if (is_val) { + oldcount = HLL_SPARSE_VAL_VALUE(p); + /* Case A. */ + if (oldcount >= count) return 0; + + /* Case B. */ + if (runlen == 1) { + HLL_SPARSE_VAL_SET(p,count,1); + goto updated; + } + } + + /* C) Another trivial to handle case is a ZERO opcode with a len of 1. + * We can just replace it with a VAL opcode with our value and len of 1. */ + if (is_zero && runlen == 1) { + HLL_SPARSE_VAL_SET(p,count,1); + goto updated; + } + + /* D) General case. + * + * The other cases are more complex: our register requires to be updated + * and is either currently represented by a VAL opcode with len > 1, + * by a ZERO opcode with len > 1, or by an XZERO opcode. + * + * In those cases the original opcode must be split into muliple + * opcodes. The worst case is an XZERO split in the middle resuling into + * XZERO - VAL - XZERO, so the resulting sequence max length is + * 5 bytes. + * + * We perform the split writing the new sequence into the 'new' buffer + * with 'newlen' as length. Later the new sequence is inserted in place + * of the old one, possibly moving what is on the right a few bytes + * if the new sequence is longer than the older one. */ + uint8_t seq[5], *n = seq; + int last = first+span-1; /* Last register covered by the sequence. */ + int len; + + if (is_zero || is_xzero) { + /* Handle splitting of ZERO / XZERO. */ + if (index != first) { + len = index-first; + if (len > HLL_SPARSE_ZERO_MAX_LEN) { + HLL_SPARSE_XZERO_SET(n,len); + n += 2; + } else { + HLL_SPARSE_ZERO_SET(n,len); + n++; + } + } + HLL_SPARSE_VAL_SET(n,count,1); + n++; + if (index != last) { + len = last-index; + if (len > HLL_SPARSE_ZERO_MAX_LEN) { + HLL_SPARSE_XZERO_SET(n,len); + n += 2; + } else { + HLL_SPARSE_ZERO_SET(n,len); + n++; + } + } + } else { + /* Handle splitting of VAL. */ + int curval = HLL_SPARSE_VAL_VALUE(p); + + if (index != first) { + len = index-first; + HLL_SPARSE_VAL_SET(n,curval,len); + n++; + } + HLL_SPARSE_VAL_SET(n,count,1); + n++; + if (index != last) { + len = last-index; + HLL_SPARSE_VAL_SET(n,curval,len); + n++; + } + } + + /* Step 3: substitute the new sequence with the old one. + * + * Note that we already allocated space on the sds string + * calling sdsMakeRoomFor(). */ + int seqlen = seq-n; + int oldlen = is_xzero ? 2 : 1; + int deltalen = seqlen-oldlen; + if (deltalen && next) { + memmove(next+deltalen,next,next-sparse); + sdsIncrLen(o->ptr,deltalen); + } + memcpy(p,seq,seqlen); + +updated: + /* Step 4: Merge adjacent values if possible. + * + * The representation was updated, however the resulting representation + * may not be optimal: adjacent opcodes may be merged into a single one. + * We start from the opcode before the one we updated trying to merge + * opcodes up to the next 5 opcodes (since we need to consider the three + * opcodes resuling from the worst-case split of the updated opcode, + * plus the two opcodes at the left and right of the original one). */ + hdr = o->ptr; + HLL_INVALIDATE_CACHE(hdr); + return 1; + +promote: /* Promote to dense representation. */ + o->ptr = hllSparseToDense(o->ptr); + hdr = o->ptr; + return hllDenseAdd(hdr->registers, ele, elesize); +} + +/* Compute SUM(2^-reg) in the sparse representation. + * PE is an array with a pre-computer table of values 2^-reg indexed by reg. + * As a side effect the integer pointed by 'ezp' is set to the number + * of zero registers. */ +double hllSparseSum(uint8_t *sparse, int sparselen, double *PE, int *ezp) { + double E = 0; + int ez = 0, idx = 0, runlen, regval; + uint8_t *end = sparse+sparselen, *p = sparse; + + while(p < end) { + /* Set span to the number of registers covered by this opcode. */ + if (HLL_SPARSE_IS_ZERO(p)) { + runlen = HLL_SPARSE_ZERO_LEN(p); + idx += runlen; + ez += runlen; + E += 1; /* 2^(-reg[j]) is 1 when m is 0. */ + } else if (HLL_SPARSE_IS_XZERO(p)) { + runlen = HLL_SPARSE_XZERO_LEN(p); + idx += runlen; + ez += runlen; + E += 1; /* 2^(-reg[j]) is 1 when m is 0. */ + } else { + runlen = HLL_SPARSE_VAL_LEN(p); + regval = HLL_SPARSE_VAL_VALUE(p); + idx += runlen; + E += PE[regval]*runlen; + } + } + redisAssert(idx == HLL_REGISTERS); + *ezp = ez; + return E; +} + /* ========================= HyperLogLog Count ============================== * This is the core of the algorithm where the approximated count is computed. * The function uses the lower level hllDenseSum() and hllSparseSum() functions @@ -545,7 +804,8 @@ double hllDenseSum(uint8_t *registers, double *PE, int *ezp) { * representation-specific, while all the rest is common. */ /* Return the approximated cardinality of the set based on the armonic - * mean of the registers values. */ + * mean of the registers values. 'hdr' points to the start of the SDS + * representing the String object holding the HLL representation. */ uint64_t hllCount(struct hllhdr *hdr) { double m = HLL_REGISTERS; double alpha = 0.7213/(1+1.079/m); @@ -570,7 +830,7 @@ uint64_t hllCount(struct hllhdr *hdr) { if (hdr->encoding == HLL_DENSE) { E = hllDenseSum(hdr->registers,PE,&ez); } else { - E = 0; /* FIXME */ + E = hllSparseSum(hdr->registers,sdslen((sds)hdr)-HLL_HDR_SIZE,PE,&ez); } /* Muliply the inverse of E for alpha_m * m^2 to have the raw estimate. */ -- GitLab