From c04da26c30e274dd6eea755e96056db3d5c7a2ef Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Tue, 12 Jan 2021 22:42:29 +0800 Subject: [PATCH] [TD-2634] --- src/query/inc/qPercentile.h | 38 +- src/query/src/qAggMain.c | 2 +- src/query/src/qPercentile.c | 558 +++++++---------------------- src/query/tests/percentileTest.cpp | 254 +++++++++++++ src/util/src/tcompare.c | 55 ++- 5 files changed, 447 insertions(+), 460 deletions(-) create mode 100644 src/query/tests/percentileTest.cpp diff --git a/src/query/inc/qPercentile.h b/src/query/inc/qPercentile.h index c34c24c5b2..f5b770593c 100644 --- a/src/query/inc/qPercentile.h +++ b/src/query/inc/qPercentile.h @@ -16,32 +16,36 @@ #ifndef TDENGINE_QPERCENTILE_H #define TDENGINE_QPERCENTILE_H +#ifdef __cplusplus +extern "C" { +#endif + #include "qExtbuffer.h" #include "qResultbuf.h" #include "qTsbuf.h" typedef struct MinMaxEntry { union { - double dMinVal; - int32_t iMinVal; - int64_t i64MinVal; + double dMinVal; + int64_t i64MinVal; + uint64_t u64MinVal; }; union { double dMaxVal; - int32_t iMaxVal; int64_t i64MaxVal; + uint64_t u64MaxVal; /* unsigned like u64MinVal, so maxima above INT64_MAX are not read back as negative */ }; } MinMaxEntry; typedef struct { - int32_t size; - int32_t pageId; + int32_t size; + int32_t pageId; tFilePage *data; } SSlotInfo; typedef struct tMemBucketSlot { - SSlotInfo info; - MinMaxEntry range; + SSlotInfo info; + MinMaxEntry range; } tMemBucketSlot; struct tMemBucket; @@ -52,16 +56,16 @@ typedef struct tMemBucket { int16_t type; int16_t bytes; int32_t total; - int32_t elemPerPage; // number of elements for each object - int32_t maxCapacity; // maximum allowed number of elements that can be sort directly to get the result - int32_t bufPageSize; // disk page size - MinMaxEntry range; // value range - int32_t times; // count that has been checked for deciding the correct data value buckets. 
+ int32_t elemPerPage; // number of elements for each object + int32_t maxCapacity; // maximum allowed number of elements that can be sort directly to get the result + int32_t bufPageSize; // disk page size + MinMaxEntry range; // value range + int32_t times; // count that has been checked for deciding the correct data value buckets. __compar_fn_t comparFn; - tMemBucketSlot *pSlots; + tMemBucketSlot * pSlots; SDiskbasedResultBuf *pBuffer; - __perc_hash_func_t hashFunc; + __perc_hash_func_t hashFunc; } tMemBucket; tMemBucket *tMemBucketCreate(int16_t nElemSize, int16_t dataType, double minval, double maxval); @@ -73,3 +77,7 @@ int32_t tMemBucketPut(tMemBucket *pBucket, const void *data, size_t size); double getPercentile(tMemBucket *pMemBucket, double percent); #endif // TDENGINE_QPERCENTILE_H + +#ifdef __cplusplus +} +#endif \ No newline at end of file diff --git a/src/query/src/qAggMain.c b/src/query/src/qAggMain.c index 0427d65561..3c7fd794bf 100644 --- a/src/query/src/qAggMain.c +++ b/src/query/src/qAggMain.c @@ -2545,7 +2545,7 @@ static void percentile_next_step(SQLFunctionCtx *pCtx) { if (pInfo->numOfElems == 0) { pResInfo->complete = true; } else { - pInfo->pMemBucket = tMemBucketCreate(pCtx->inputBytes, pCtx->inputType, GET_DOUBLE_VAL(&pInfo->minval), GET_DOUBLE_VAL(&pInfo->maxval)); + pInfo->pMemBucket = tMemBucketCreate(pCtx->inputBytes, pCtx->inputType, pInfo->minval, pInfo->maxval); } pInfo->stage += 1; diff --git a/src/query/src/qPercentile.c b/src/query/src/qPercentile.c index 51125d62b9..9d4dde2207 100644 --- a/src/query/src/qPercentile.c +++ b/src/query/src/qPercentile.c @@ -20,6 +20,7 @@ #include "taosdef.h" #include "tulog.h" #include "tcompare.h" +#include "ttype.h" #define DEFAULT_NUM_OF_SLOT 1024 @@ -48,25 +49,15 @@ static tFilePage *loadDataFromFilePage(tMemBucket *pMemBucket, int32_t slotIdx) } static void resetBoundingBox(MinMaxEntry* range, int32_t type) { - switch (type) { - case TSDB_DATA_TYPE_BIGINT: { - range->i64MaxVal = INT64_MIN; - 
range->i64MinVal = INT64_MAX; - break; - }; - case TSDB_DATA_TYPE_INT: - case TSDB_DATA_TYPE_SMALLINT: - case TSDB_DATA_TYPE_TINYINT: { - range->iMaxVal = INT32_MIN; - range->iMinVal = INT32_MAX; - break; - }; - case TSDB_DATA_TYPE_DOUBLE: - case TSDB_DATA_TYPE_FLOAT: { - range->dMaxVal = -DBL_MAX; - range->dMinVal = DBL_MAX; - break; - } + if (IS_SIGNED_NUMERIC_TYPE(type)) { + range->i64MaxVal = INT64_MIN; + range->i64MinVal = INT64_MAX; + } else if (IS_UNSIGNED_NUMERIC_TYPE(type)) { + range->u64MaxVal = 0; + range->u64MinVal = UINT64_MAX; + } else { + range->dMaxVal = -DBL_MAX; + range->dMinVal = DBL_MAX; } } @@ -75,23 +66,15 @@ static int32_t setBoundingBox(MinMaxEntry* range, int16_t type, double minval, d return -1; } - switch(type) { - case TSDB_DATA_TYPE_TINYINT: - case TSDB_DATA_TYPE_SMALLINT: - case TSDB_DATA_TYPE_INT: - range->iMinVal = (int32_t) minval; - range->iMaxVal = (int32_t) maxval; - break; - - case TSDB_DATA_TYPE_BIGINT: - range->i64MinVal = (int64_t) minval; - range->i64MaxVal = (int64_t) maxval; - break; - case TSDB_DATA_TYPE_FLOAT: - case TSDB_DATA_TYPE_DOUBLE: - range->dMinVal = minval; - range->dMaxVal = maxval; - break; + if (IS_SIGNED_NUMERIC_TYPE(type)) { + range->i64MinVal = (int64_t) minval; + range->i64MaxVal = (int64_t) maxval; + } else if (IS_UNSIGNED_NUMERIC_TYPE(type)){ + range->u64MinVal = (uint64_t) minval; + range->u64MaxVal = (uint64_t) maxval; + } else { + range->dMinVal = minval; + range->dMaxVal = maxval; } return 0; @@ -120,117 +103,56 @@ double findOnlyResult(tMemBucket *pMemBucket) { tFilePage* pPage = getResBufPage(pMemBucket->pBuffer, pgInfo->pageId); assert(pPage->num == 1); - switch (pMemBucket->type) { - case TSDB_DATA_TYPE_INT: - return *(int32_t *)pPage->data; - case TSDB_DATA_TYPE_SMALLINT: - return *(int16_t *)pPage->data; - case TSDB_DATA_TYPE_TINYINT: - return *(int8_t *)pPage->data; - case TSDB_DATA_TYPE_BIGINT: - return (double)(*(int64_t *)pPage->data); - case TSDB_DATA_TYPE_DOUBLE: { - double dv = 
GET_DOUBLE_VAL(pPage->data); - return dv; - } - case TSDB_DATA_TYPE_FLOAT: { - float fv = GET_FLOAT_VAL(pPage->data); - return fv; - } - default: - return 0; - } + double v = 0; + GET_TYPED_DATA(v, double, pMemBucket->type, pPage->data); + return v; } return 0; } -int32_t tBucketBigIntHash(tMemBucket *pBucket, const void *value) { - int64_t v = *(int64_t *)value; - int32_t index = -1; - - int32_t halfSlot = pBucket->numOfSlots >> 1; -// int32_t bits = 32;//bitsOfNumber(pBucket->numOfSlots) - 1; - - if (pBucket->range.i64MaxVal == INT64_MIN) { - if (v >= 0) { - index = (v >> (64 - 9)) + halfSlot; - } else { // v<0 - index = ((-v) >> (64 - 9)); - index = -index + (halfSlot - 1); - } +int32_t tBucketIntHash(tMemBucket *pBucket, const void *value) { + int64_t v = 0; + GET_TYPED_DATA(v, int64_t, pBucket->type, value); - return index; + int32_t index = -1; + // divide the value range into 1024 buckets + uint64_t span = pBucket->range.i64MaxVal - pBucket->range.i64MinVal; + if (span < pBucket->numOfSlots) { + int32_t delta = v - pBucket->range.i64MinVal; + index = (delta % pBucket->numOfSlots); } else { - // out of range - if (v < pBucket->range.i64MinVal || v > pBucket->range.i64MaxVal) { - return -1; + double slotSpan = (double)span / pBucket->numOfSlots; + index = (int32_t)((v - pBucket->range.i64MinVal) / slotSpan); + if (v == pBucket->range.i64MaxVal) { + index -= 1; } - - // todo hash for bigint and float and double - int64_t span = pBucket->range.i64MaxVal - pBucket->range.i64MinVal; - if (span < pBucket->numOfSlots) { - int32_t delta = (int32_t)(v - pBucket->range.i64MinVal); - index = delta % pBucket->numOfSlots; - } else { - double slotSpan = (double)span / pBucket->numOfSlots; - index = (int32_t)((v - pBucket->range.i64MinVal) / slotSpan); - if (v == pBucket->range.i64MaxVal) { - index -= 1; - } - } - - return index; } + + assert(v >= pBucket->range.i64MinVal && v <= pBucket->range.i64MaxVal && index >= 0 && index < pBucket->numOfSlots); + return index; } -// 
todo refactor to more generic -int32_t tBucketIntHash(tMemBucket *pBucket, const void *value) { - int32_t v = 0; - switch(pBucket->type) { - case TSDB_DATA_TYPE_SMALLINT: v = *(int16_t*) value; break; - case TSDB_DATA_TYPE_TINYINT: v = *(int8_t*) value; break; - default: v = *(int32_t*) value;break; - } +int32_t tBucketUintHash(tMemBucket *pBucket, const void *value) { + uint64_t v = 0; /* unsigned, matching the uint64_t load below: inputs above INT64_MAX must not turn negative */ + GET_TYPED_DATA(v, uint64_t, pBucket->type, value); int32_t index = -1; - if (pBucket->range.iMaxVal == INT32_MIN) { - /* - * taking negative integer into consideration, - * there is only half of pBucket->segs available for non-negative integer - */ - int32_t halfSlot = pBucket->numOfSlots >> 1; - int32_t bits = 32;//bitsOfNumber(pBucket->numOfSlots) - 1; - - if (v >= 0) { - index = (v >> (bits - 9)) + halfSlot; - } else { // v < 0 - index = ((-v) >> (32 - 9)); - index = -index + (halfSlot - 1); - } - - return index; + // divide the value range into 1024 buckets + uint64_t span = pBucket->range.u64MaxVal - pBucket->range.u64MinVal; + if (span < pBucket->numOfSlots) { + int32_t delta = v - pBucket->range.u64MinVal; + index = (delta % pBucket->numOfSlots); } else { - // out of range - if (v < pBucket->range.iMinVal || v > pBucket->range.iMaxVal) { - return -1; - } - - // divide a range of [iMinVal, iMaxVal] into 1024 buckets - int32_t span = pBucket->range.iMaxVal - pBucket->range.iMinVal; - if (span < pBucket->numOfSlots) { - int32_t delta = v - pBucket->range.iMinVal; - index = (delta % pBucket->numOfSlots); - } else { - double slotSpan = (double)span / pBucket->numOfSlots; - index = (int32_t)((v - pBucket->range.iMinVal) / slotSpan); - if (v == pBucket->range.iMaxVal) { - index -= 1; - } + double slotSpan = (double)span / pBucket->numOfSlots; + index = (int32_t)((v - pBucket->range.u64MinVal) / slotSpan); + if (v == pBucket->range.u64MaxVal) { + index -= 1; } - - return index; } + + assert(v >= pBucket->range.u64MinVal && v <= pBucket->range.u64MaxVal && index >= 0 && index < 
pBucket->numOfSlots); + return index; } int32_t tBucketDoubleHash(tMemBucket *pBucket, const void *value) { @@ -243,62 +165,30 @@ int32_t tBucketDoubleHash(tMemBucket *pBucket, const void *value) { int32_t index = -1; - if (pBucket->range.dMinVal == DBL_MAX) { - /* - * taking negative integer into consideration, - * there is only half of pBucket->segs available for non-negative integer - */ - double x = DBL_MAX / (pBucket->numOfSlots >> 1); - double posx = (v + DBL_MAX) / x; - return ((int32_t)posx) % pBucket->numOfSlots; + // divide a range of [dMinVal, dMaxVal] into 1024 buckets + double span = pBucket->range.dMaxVal - pBucket->range.dMinVal; + if (span < pBucket->numOfSlots) { + int32_t delta = (int32_t)(v - pBucket->range.dMinVal); + index = (delta % pBucket->numOfSlots); } else { - - // out of range - if (v < pBucket->range.dMinVal || v > pBucket->range.dMaxVal) { - return -1; + double slotSpan = span / pBucket->numOfSlots; + index = (int32_t)((v - pBucket->range.dMinVal) / slotSpan); + if (v == pBucket->range.dMaxVal) { + index -= 1; } - - // divide a range of [dMinVal, dMaxVal] into 1024 buckets - double span = pBucket->range.dMaxVal - pBucket->range.dMinVal; - if (span < pBucket->numOfSlots) { - int32_t delta = (int32_t)(v - pBucket->range.dMinVal); - index = (delta % pBucket->numOfSlots); - } else { - double slotSpan = span / pBucket->numOfSlots; - index = (int32_t)((v - pBucket->range.dMinVal) / slotSpan); - if (v == pBucket->range.dMaxVal) { - index -= 1; - } - } - - if (index < 0 || index > pBucket->numOfSlots) { - uError("error in hash process. 
slot id: %d", index); - } - - return index; } + + assert(v >= pBucket->range.dMinVal && v <= pBucket->range.dMaxVal && index >= 0 && index < pBucket->numOfSlots); + return index; } static __perc_hash_func_t getHashFunc(int32_t type) { - switch (type) { - case TSDB_DATA_TYPE_INT: - case TSDB_DATA_TYPE_SMALLINT: - case TSDB_DATA_TYPE_TINYINT: { - return tBucketIntHash; - }; - - case TSDB_DATA_TYPE_DOUBLE: - case TSDB_DATA_TYPE_FLOAT: { - return tBucketDoubleHash; - }; - - case TSDB_DATA_TYPE_BIGINT: { - return tBucketBigIntHash; - }; - - default: { - return NULL; - } + if (IS_SIGNED_NUMERIC_TYPE(type)) { + return tBucketIntHash; + } else if (IS_UNSIGNED_NUMERIC_TYPE(type)) { + return tBucketUintHash; + } else { + return tBucketDoubleHash; } } @@ -372,77 +262,41 @@ void tMemBucketDestroy(tMemBucket *pBucket) { } void tMemBucketUpdateBoundingBox(MinMaxEntry *r, const char *data, int32_t dataType) { - switch (dataType) { - case TSDB_DATA_TYPE_INT: { - int32_t val = *(int32_t *)data; - if (r->iMinVal > val) { - r->iMinVal = val; - } + if (IS_SIGNED_NUMERIC_TYPE(dataType)) { + int64_t v = 0; + GET_TYPED_DATA(v, int64_t, dataType, data); - if (r->iMaxVal < val) { - r->iMaxVal = val; - } - break; - }; - case TSDB_DATA_TYPE_BIGINT: { - int64_t val = *(int64_t *)data; - if (r->i64MinVal > val) { - r->i64MinVal = val; - } - - if (r->i64MaxVal < val) { - r->i64MaxVal = val; - } - break; - }; - case TSDB_DATA_TYPE_SMALLINT: { - int32_t val = *(int16_t *)data; - if (r->iMinVal > val) { - r->iMinVal = val; - } - - if (r->iMaxVal < val) { - r->iMaxVal = val; - } - break; - }; - case TSDB_DATA_TYPE_TINYINT: { - int32_t val = *(int8_t *)data; - if (r->iMinVal > val) { - r->iMinVal = val; - } + if (r->i64MinVal > v) { + r->i64MinVal = v; + } - if (r->iMaxVal < val) { - r->iMaxVal = val; - } + if (r->i64MaxVal < v) { + r->i64MaxVal = v; + } + } else if (IS_UNSIGNED_NUMERIC_TYPE(dataType)) { + uint64_t v = 0; + GET_TYPED_DATA(v, uint64_t, dataType, data); - break; - }; - case 
TSDB_DATA_TYPE_DOUBLE: { - // double val = *(double *)data; - double val = GET_DOUBLE_VAL(data); - if (r->dMinVal > val) { - r->dMinVal = val; - } + if (r->u64MinVal > v) { /* unsigned branch: use the u64 members, as resetBoundingBox does */ + r->u64MinVal = v; + } - if (r->dMaxVal < val) { - r->dMaxVal = val; - } - break; - }; - case TSDB_DATA_TYPE_FLOAT: { - double val = GET_FLOAT_VAL(data); + if (r->u64MaxVal < v) { + r->u64MaxVal = v; + } + } else if (IS_FLOAT_TYPE(dataType)) { + double v = 0; + GET_TYPED_DATA(v, double, dataType, data); - if (r->dMinVal > val) { - r->dMinVal = val; - } + if (r->dMinVal > v) { + r->dMinVal = v; + } - if (r->dMaxVal < val) { - r->dMaxVal = val; - } - break; - }; - default: { assert(false); } + if (r->dMaxVal < v) { + r->dMaxVal = v; + } + } else { + assert(0); } } @@ -452,16 +306,13 @@ void tMemBucketUpdateBoundingBox(MinMaxEntry *r, const char *data, int32_t dataT int32_t tMemBucketPut(tMemBucket *pBucket, const void *data, size_t size) { assert(pBucket != NULL && data != NULL && size > 0); - pBucket->total += (int32_t)size; - + int32_t count = 0; int32_t bytes = pBucket->bytes; for (int32_t i = 0; i < size; ++i) { char *d = (char *) data + i * bytes; + count += 1; int32_t index = (pBucket->hashFunc)(pBucket, d); - if (index == -1) { // the value is out of range, do not add it into bucket - return -1; - } tMemBucketSlot *pSlot = &pBucket->pSlots[index]; tMemBucketUpdateBoundingBox(&pSlot->range, d, pBucket->type); @@ -489,64 +340,11 @@ int32_t tMemBucketPut(tMemBucket *pBucket, const void *data, size_t size) { pSlot->info.size += 1; } + pBucket->total += count; return 0; } //////////////////////////////////////////////////////////////////////////////////////////// -static UNUSED_FUNC void findMaxMinValue(tMemBucket *pMemBucket, double *maxVal, double *minVal) { - *minVal = DBL_MAX; - *maxVal = -DBL_MAX; - - for (int32_t i = 0; i < pMemBucket->numOfSlots; ++i) { - tMemBucketSlot *pSlot = &pMemBucket->pSlots[i]; - if (pSlot->info.size == 0) { - continue; - } - - switch (pMemBucket->type) { - case 
TSDB_DATA_TYPE_INT: - case TSDB_DATA_TYPE_SMALLINT: - case TSDB_DATA_TYPE_TINYINT: { - double minv = pSlot->range.iMinVal; - double maxv = pSlot->range.iMaxVal; - - if (*minVal > minv) { - *minVal = minv; - } - if (*maxVal < maxv) { - *maxVal = maxv; - } - break; - } - case TSDB_DATA_TYPE_DOUBLE: - case TSDB_DATA_TYPE_FLOAT: { - double minv = pSlot->range.dMinVal; - double maxv = pSlot->range.dMaxVal; - - if (*minVal > minv) { - *minVal = minv; - } - if (*maxVal < maxv) { - *maxVal = maxv; - } - break; - } - case TSDB_DATA_TYPE_BIGINT: { - double minv = (double)pSlot->range.i64MinVal; - double maxv = (double)pSlot->range.i64MaxVal; - - if (*minVal > minv) { - *minVal = minv; - } - if (*maxVal < maxv) { - *maxVal = maxv; - } - break; - } - } - } -} - /* * * now, we need to find the minimum value of the next slot for @@ -565,7 +363,6 @@ static MinMaxEntry getMinMaxEntryOfNextSlotWithData(tMemBucket *pMemBucket, int3 } static bool isIdenticalData(tMemBucket *pMemBucket, int32_t index); -char *getFirstElemOfMemBuffer(tMemBucketSlot *pSeg, int32_t slotIdx, tFilePage *pPage); static double getIdenticalDataVal(tMemBucket* pMemBucket, int32_t slotIndex) { assert(isIdenticalData(pMemBucket, slotIndex)); @@ -573,24 +370,12 @@ static double getIdenticalDataVal(tMemBucket* pMemBucket, int32_t slotIndex) { tMemBucketSlot *pSlot = &pMemBucket->pSlots[slotIndex]; double finalResult = 0.0; - switch (pMemBucket->type) { - case TSDB_DATA_TYPE_SMALLINT: - case TSDB_DATA_TYPE_TINYINT: - case TSDB_DATA_TYPE_INT: { - finalResult = pSlot->range.iMinVal; - break; - } - - case TSDB_DATA_TYPE_FLOAT: - case TSDB_DATA_TYPE_DOUBLE: { - finalResult = pSlot->range.dMinVal; - break; - }; - - case TSDB_DATA_TYPE_BIGINT: { - finalResult = (double)pSlot->range.i64MinVal; - break; - } + if (IS_SIGNED_NUMERIC_TYPE(pMemBucket->type)) { + finalResult = pSlot->range.i64MinVal; + } else if (IS_UNSIGNED_NUMERIC_TYPE(pMemBucket->type)) { + finalResult = pSlot->range.u64MinVal; + } else { + finalResult = 
pSlot->range.dMinVal; } return finalResult; @@ -616,26 +401,16 @@ double getPercentileImpl(tMemBucket *pMemBucket, int32_t count, double fraction) double maxOfThisSlot = 0; double minOfNextSlot = 0; - switch (pMemBucket->type) { - case TSDB_DATA_TYPE_INT: - case TSDB_DATA_TYPE_SMALLINT: - case TSDB_DATA_TYPE_TINYINT: { - maxOfThisSlot = pSlot->range.iMaxVal; - minOfNextSlot = next.iMinVal; - break; - }; - case TSDB_DATA_TYPE_FLOAT: - case TSDB_DATA_TYPE_DOUBLE: { - maxOfThisSlot = pSlot->range.dMaxVal; - minOfNextSlot = next.dMinVal; - break; - }; - case TSDB_DATA_TYPE_BIGINT: { - maxOfThisSlot = (double)pSlot->range.i64MaxVal; - minOfNextSlot = (double)next.i64MinVal; - break; - } - }; + if (IS_SIGNED_NUMERIC_TYPE(pMemBucket->type)) { + maxOfThisSlot = pSlot->range.i64MaxVal; + minOfNextSlot = next.i64MinVal; + } else if (IS_UNSIGNED_NUMERIC_TYPE(pMemBucket->type)) { + maxOfThisSlot = pSlot->range.u64MaxVal; + minOfNextSlot = next.u64MinVal; + } else { + maxOfThisSlot = pSlot->range.dMaxVal; + minOfNextSlot = next.dMinVal; + } assert(minOfNextSlot > maxOfThisSlot); @@ -652,38 +427,8 @@ double getPercentileImpl(tMemBucket *pMemBucket, int32_t count, double fraction) char *nextVal = thisVal + pMemBucket->bytes; double td = 1.0, nd = 1.0; - switch (pMemBucket->type) { - case TSDB_DATA_TYPE_SMALLINT: { - td = *(int16_t *)thisVal; - nd = *(int16_t *)nextVal; - break; - } - case TSDB_DATA_TYPE_TINYINT: { - td = *(int8_t *)thisVal; - nd = *(int8_t *)nextVal; - break; - } - case TSDB_DATA_TYPE_INT: { - td = *(int32_t *)thisVal; - nd = *(int32_t *)nextVal; - break; - }; - case TSDB_DATA_TYPE_FLOAT: { - td = GET_FLOAT_VAL(thisVal); - nd = GET_FLOAT_VAL(nextVal); - break; - } - case TSDB_DATA_TYPE_DOUBLE: { - td = GET_DOUBLE_VAL(thisVal); - nd = GET_DOUBLE_VAL(nextVal); - break; - } - case TSDB_DATA_TYPE_BIGINT: { - td = (double)*(int64_t *)thisVal; - nd = (double)*(int64_t *)nextVal; - break; - } - } + GET_TYPED_DATA(td, double, pMemBucket->type, thisVal); + 
GET_TYPED_DATA(nd, double, pMemBucket->type, nextVal); double val = (1 - fraction) * td + fraction * nd; tfree(buffer); @@ -741,20 +486,14 @@ double getPercentile(tMemBucket *pMemBucket, double percent) { if (fabs(percent - 100.0) < DBL_EPSILON || (percent < DBL_EPSILON)) { MinMaxEntry* pRange = &pMemBucket->range; - switch(pMemBucket->type) { - case TSDB_DATA_TYPE_TINYINT: - case TSDB_DATA_TYPE_SMALLINT: - case TSDB_DATA_TYPE_INT: - return fabs(percent - 100) < DBL_EPSILON? pRange->iMaxVal:pRange->iMinVal; - case TSDB_DATA_TYPE_BIGINT: { - double v = (double)(fabs(percent - 100) < DBL_EPSILON ? pRange->i64MaxVal : pRange->i64MinVal); - return v; - } - case TSDB_DATA_TYPE_FLOAT: - case TSDB_DATA_TYPE_DOUBLE: - return fabs(percent - 100) < DBL_EPSILON? pRange->dMaxVal:pRange->dMinVal; - default: - return -1; + if (IS_SIGNED_NUMERIC_TYPE(pMemBucket->type)) { + double v = (double)(fabs(percent - 100) < DBL_EPSILON ? pRange->i64MaxVal : pRange->i64MinVal); + return v; + } else if (IS_UNSIGNED_NUMERIC_TYPE(pMemBucket->type)) { + double v = (double)(fabs(percent - 100) < DBL_EPSILON ? pRange->u64MaxVal : pRange->u64MinVal); + return v; + } else { + return fabs(percent - 100) < DBL_EPSILON? 
pRange->dMaxVal:pRange->dMinVal; } } @@ -771,40 +510,9 @@ double getPercentile(tMemBucket *pMemBucket, double percent) { bool isIdenticalData(tMemBucket *pMemBucket, int32_t index) { tMemBucketSlot *pSeg = &pMemBucket->pSlots[index]; - if (pMemBucket->type == TSDB_DATA_TYPE_INT || pMemBucket->type == TSDB_DATA_TYPE_BIGINT || - pMemBucket->type == TSDB_DATA_TYPE_SMALLINT || pMemBucket->type == TSDB_DATA_TYPE_TINYINT) { - return pSeg->range.i64MinVal == pSeg->range.i64MaxVal; - } - - if (pMemBucket->type == TSDB_DATA_TYPE_FLOAT || pMemBucket->type == TSDB_DATA_TYPE_DOUBLE) { + if (IS_FLOAT_TYPE(pMemBucket->type)) { return fabs(pSeg->range.dMaxVal - pSeg->range.dMinVal) < DBL_EPSILON; + } else { + return pSeg->range.i64MinVal == pSeg->range.i64MaxVal; } - - return false; -} - -/* - * get the first element of one slot into memory. - * if no data of current slot in memory, load it from disk - */ -char *getFirstElemOfMemBuffer(tMemBucketSlot *pSeg, int32_t slotIdx, tFilePage *pPage) { -// STSBuf *pMemBuffer = pSeg->pBuffer[slotIdx]; - char *thisVal = NULL; - -// if (pSeg->pBuffer[slotIdx]->numOfTotal != 0) { -//// thisVal = pSeg->pBuffer[slotIdx]->pHead->item.data; -// } else { -// /* -// * no data in memory, load one page into memory -// */ -// tFlushoutInfo *pFlushInfo = &pMemBuffer->fileMeta.flushoutData.pFlushoutInfo[0]; -// assert(pFlushInfo->numOfPages == pMemBuffer->fileMeta.nFileSize); -// int32_t ret; -// ret = fseek(pMemBuffer->file, pFlushInfo->startPageId * pMemBuffer->pageSize, SEEK_SET); -// UNUSED(ret); -// size_t sz = fread(pPage, pMemBuffer->pageSize, 1, pMemBuffer->file); -// UNUSED(sz); -// thisVal = pPage->data; -// } - return thisVal; -} +} \ No newline at end of file diff --git a/src/query/tests/percentileTest.cpp b/src/query/tests/percentileTest.cpp new file mode 100644 index 0000000000..f1fc458501 --- /dev/null +++ b/src/query/tests/percentileTest.cpp @@ -0,0 +1,254 @@ +#include +#include + +#include "qResultbuf.h" +#include "taos.h" +#include 
"taosdef.h" + +#include "qPercentile.h" + +namespace { +tMemBucket *createBigIntDataBucket(int32_t start, int32_t end) { + tMemBucket *pBucket = tMemBucketCreate(sizeof(int64_t), TSDB_DATA_TYPE_BIGINT, start, end); + for (int32_t i = start; i <= end; ++i) { + int64_t val = i; + tMemBucketPut(pBucket, &val, 1); + } + + return pBucket; +} + +tMemBucket *createIntDataBucket(int32_t start, int32_t end) { + tMemBucket *pBucket = tMemBucketCreate(sizeof(int32_t), TSDB_DATA_TYPE_INT, start, end); + + for (int32_t i = start; i <= end; ++i) { + int32_t val = i; + tMemBucketPut(pBucket, &val, 1); + } + + return pBucket; +} + +tMemBucket *createDoubleDataBucket(int32_t start, int32_t end) { + tMemBucket *pBucket = tMemBucketCreate(sizeof(double), TSDB_DATA_TYPE_DOUBLE, start, end); + for (int32_t i = start; i <= end; ++i) { + double val = i; + int32_t ret = tMemBucketPut(pBucket, &val, 1); + if (ret != 0) { + printf("value out of range:%f", val); + } + } + + return pBucket; +} + +tMemBucket *createUnsignedDataBucket(int32_t start, int32_t end, int32_t type) { + tMemBucket *pBucket = tMemBucketCreate(tDataTypeDesc[type].nSize, type, start, end); + for (int32_t i = start; i <= end; ++i) { + uint64_t k = i; + int32_t ret = tMemBucketPut(pBucket, &k, 1); + if (ret != 0) { + printf("value out of range:%f", k); + } + } + + return pBucket; +} + +void intDataTest() { + printf("running %s\n", __FUNCTION__); + + tMemBucket *pBucket = NULL; + double result = 0.; + + pBucket = createIntDataBucket(0, 0); + result = getPercentile(pBucket, 0); + ASSERT_DOUBLE_EQ(result, 0); + tMemBucketDestroy(pBucket); + + pBucket = createIntDataBucket(0, 1); + result = getPercentile(pBucket, 100); + ASSERT_DOUBLE_EQ(result, 1); + + result = getPercentile(pBucket, 0); + ASSERT_DOUBLE_EQ(result, 0); + tMemBucketDestroy(pBucket); + + pBucket = createIntDataBucket(-1, 1); + + result = getPercentile(pBucket, 50); + ASSERT_DOUBLE_EQ(result, 0); + + result = getPercentile(pBucket, 0); + ASSERT_DOUBLE_EQ(result, 
-1); + + result = getPercentile(pBucket, 75); + ASSERT_DOUBLE_EQ(result, 0.5); + + result = getPercentile(pBucket, 100); + ASSERT_DOUBLE_EQ(result, 1); + tMemBucketDestroy(pBucket); + + pBucket = createIntDataBucket(0, 99999); + result = getPercentile(pBucket, 50); + ASSERT_DOUBLE_EQ(result, 49999.5); + + tMemBucketDestroy(pBucket); +} + +void bigintDataTest() { + printf("running %s\n", __FUNCTION__); + + tMemBucket *pBucket = NULL; + double result = 0.0; + + pBucket = createBigIntDataBucket(-1000, 1000); + result = getPercentile(pBucket, 50); + ASSERT_DOUBLE_EQ(result, 0.); + tMemBucketDestroy(pBucket); + + pBucket = createBigIntDataBucket(-10000, 10000); + result = getPercentile(pBucket, 100); + ASSERT_DOUBLE_EQ(result, 10000.0); + tMemBucketDestroy(pBucket); + + pBucket = createBigIntDataBucket(-10000, 10000); + result = getPercentile(pBucket, 75); + ASSERT_DOUBLE_EQ(result, 5000.0); + + tMemBucketDestroy(pBucket); +} + +void doubleDataTest() { + printf("running %s\n", __FUNCTION__); + + tMemBucket *pBucket = NULL; + double result = 0; + + pBucket = createDoubleDataBucket(-10, 10); + result = getPercentile(pBucket, 0); + ASSERT_DOUBLE_EQ(result, -10.0); + + printf("result is: %lf\n", result); + tMemBucketDestroy(pBucket); + + pBucket = createDoubleDataBucket(-100000, 100000); + result = getPercentile(pBucket, 25); + ASSERT_DOUBLE_EQ(result, -50000); + + printf("result is: %lf\n", result); + + tMemBucketDestroy(pBucket); + + pBucket = createDoubleDataBucket(-100000, 100000); + result = getPercentile(pBucket, 50); + ASSERT_DOUBLE_EQ(result, 0); + + tMemBucketDestroy(pBucket); + + pBucket = createDoubleDataBucket(-100000, 100000); + result = getPercentile(pBucket, 75); + ASSERT_DOUBLE_EQ(result, 50000); + tMemBucketDestroy(pBucket); + + pBucket = createDoubleDataBucket(-100000, 100000); + + result = getPercentile(pBucket, 100); + ASSERT_DOUBLE_EQ(result, 100000.0); + + printf("result is: %lf\n", result); + tMemBucketDestroy(pBucket); +} + +/* + * large data test, 
we employ 0.1billion double data to calculated the percentile + * which is 800MB data + */ +void largeDataTest() { + printf("running : %s\n", __FUNCTION__); + + tMemBucket *pBucket = NULL; + double result = 0; + + struct timeval tv; + gettimeofday(&tv, NULL); + + int64_t start = tv.tv_sec; + printf("start time: %" PRId64 "\n", tv.tv_sec); + pBucket = createDoubleDataBucket(0, 100000000); + result = getPercentile(pBucket, 50); + ASSERT_DOUBLE_EQ(result, 50000000); + + gettimeofday(&tv, NULL); + + printf("total elapsed time: %" PRId64 " sec.", -start + tv.tv_sec); + printf("the result of %d is: %lf\n", 50, result); + tMemBucketDestroy(pBucket); +} + +void qsortTest() { + printf("running : %s\n", __FUNCTION__); + + SSchema field[1] = { + {TSDB_DATA_TYPE_INT, "k", sizeof(int32_t)}, + }; + + const int32_t num = 2000; + + int32_t *d = (int32_t *)malloc(sizeof(int32_t) * num); + for (int32_t i = 0; i < num; ++i) { + d[i] = i % 4; + } + + const int32_t numOfOrderCols = 1; + int32_t orderColIdx = 0; + SColumnModel * pModel = createColumnModel(field, 1, 1000); + tOrderDescriptor *pDesc = tOrderDesCreate(&orderColIdx, numOfOrderCols, pModel, 1); + + tColDataQSort(pDesc, num, 0, num - 1, (char *)d, 1); + + for (int32_t i = 0; i < num; ++i) { + printf("%d\t", d[i]); + } + printf("\n"); + + destroyColumnModel(pModel); +} + +void unsignedDataTest() { + printf("running %s\n", __FUNCTION__); + + tMemBucket *pBucket = NULL; + double result = 0.0; + + pBucket = createUnsignedDataBucket(0, 1000, TSDB_DATA_TYPE_UINT); + result = getPercentile(pBucket, 50); + ASSERT_DOUBLE_EQ(result, 500.0); + tMemBucketDestroy(pBucket); + + pBucket = createUnsignedDataBucket(0, 10000, TSDB_DATA_TYPE_UBIGINT); + result = getPercentile(pBucket, 100); + ASSERT_DOUBLE_EQ(result, 10000.0); + + result = getPercentile(pBucket, 0); + ASSERT_DOUBLE_EQ(result, 0.0); + + result = getPercentile(pBucket, 50); + ASSERT_DOUBLE_EQ(result, 5000); + + result = getPercentile(pBucket, 75); + ASSERT_DOUBLE_EQ(result, 
7500); + tMemBucketDestroy(pBucket); + +} + +} // namespace + +TEST(testCase, percentileTest) { + // qsortTest(); + intDataTest(); + bigintDataTest(); + doubleDataTest(); + unsignedDataTest(); + largeDataTest(); +} diff --git a/src/util/src/tcompare.c b/src/util/src/tcompare.c index de6fbe7302..75ac930723 100644 --- a/src/util/src/tcompare.c +++ b/src/util/src/tcompare.c @@ -30,24 +30,32 @@ int32_t compareInt8Val(const void *pLeft, const void *pRight) { return 0; } -int32_t compareIntDoubleVal(const void *pLeft, const void *pRight) { - int64_t lhs = GET_INT64_VAL(pLeft); - double rhs = GET_DOUBLE_VAL(pRight); - if (fabs(lhs - rhs) < FLT_EPSILON) { - return 0; - } else { - return (lhs > rhs) ? 1 : -1; - } +int32_t compareUint32Val(const void *pLeft, const void *pRight) { + uint32_t left = GET_UINT32_VAL(pLeft), right = GET_UINT32_VAL(pRight); /* unsigned locals: values >= 2^31 must not compare as negative */ + if (left > right) return 1; + if (left < right) return -1; + return 0; } -int32_t compareDoubleIntVal(const void *pLeft, const void *pRight) { - double lhs = GET_DOUBLE_VAL(pLeft); - int64_t rhs = GET_INT64_VAL(pRight); - if (fabs(lhs - rhs) < FLT_EPSILON) { - return 0; - } else { - return (lhs > rhs) ? 
1 : -1; - } +int32_t compareUint64Val(const void *pLeft, const void *pRight) { + uint64_t left = GET_UINT64_VAL(pLeft), right = GET_UINT64_VAL(pRight); /* unsigned locals: values >= 2^63 must not compare as negative */ + if (left > right) return 1; + if (left < right) return -1; + return 0; +} + +int32_t compareUint16Val(const void *pLeft, const void *pRight) { + uint16_t left = GET_UINT16_VAL(pLeft), right = GET_UINT16_VAL(pRight); /* unsigned locals: values >= 2^15 must not compare as negative */ + if (left > right) return 1; + if (left < right) return -1; + return 0; +} + +int32_t compareUint8Val(const void* pLeft, const void* pRight) { + uint8_t left = GET_UINT8_VAL(pLeft), right = GET_UINT8_VAL(pRight); + if (left > right) return 1; + if (left < right) return -1; + return 0; } int32_t compareFloatVal(const void *pLeft, const void *pRight) { @@ -369,15 +377,24 @@ __compar_fn_t getKeyComparFunc(int32_t keyType) { case TSDB_DATA_TYPE_DOUBLE: comparFn = compareDoubleVal; break; - + case TSDB_DATA_TYPE_UTINYINT: + comparFn = compareUint8Val; + break; + case TSDB_DATA_TYPE_USMALLINT: + comparFn = compareUint16Val; + break; + case TSDB_DATA_TYPE_UINT: + comparFn = compareUint32Val; + break; + case TSDB_DATA_TYPE_UBIGINT: + comparFn = compareUint64Val; + break; case TSDB_DATA_TYPE_BINARY: comparFn = compareLenPrefixedStr; break; - case TSDB_DATA_TYPE_NCHAR: comparFn = compareLenPrefixedWStr; break; - default: comparFn = compareInt32Val; break; -- GitLab