Unverified commit 584af372, authored by dapan1121, committed by GitHub

Merge pull request #20124 from taosdata/feat/TS-2671-2.6

feat(query): optimize percentile function performance
@@ -569,18 +569,23 @@ Query OK, 2 row(s) in set (0.000793s)
### PERCENTILE
```
-SELECT PERCENTILE(field_name, P) FROM { tb_name } [WHERE clause];
+SELECT PERCENTILE(field_name, P [, P1] ...) FROM { tb_name } [WHERE clause];
```
**Description**: The value whose rank in a specific column matches the specified percentage. If such a value matching the specified percentage doesn't exist in the column, an interpolation value will be returned.
-**Return value type**: Double precision floating point
+**Return value type**: This function takes a minimum of 2 and a maximum of 11 parameters, and can return up to 10 percentiles at once. If 2 parameters are given, a single percentile is returned and the value type is DOUBLE.
+If more than 2 parameters are given, the return value type is a VARCHAR string formatted as a JSON array containing all return values.
**Applicable column types**: Data types except for timestamp, binary, nchar and bool
**Applicable table types**: table
-**More explanations**: _P_ is in range [0,100], when _P_ is 0, the result is same as using function MIN; when _P_ is 100, the result is same as function MAX.
+**More explanations**:
+- _P_ is in range [0,100]; when _P_ is 0, the result is the same as the MIN function, and when _P_ is 100, the result is the same as MAX.
+- When calculating multiple percentiles of a specific column, a single PERCENTILE function with multiple parameters is advised, as this can greatly reduce the query response time.
+For example, SELECT percentile(col, 90, 95, 99) FROM table will perform better than SELECT percentile(col, 90), percentile(col, 95), percentile(col, 99) FROM table.
**Examples**:
......
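When more than one P is supplied, the whole result arrives in a single VARCHAR cell as a JSON array, so a client has to decode it before using the individual values. Below is a minimal parsing sketch in Python; the string literal is copied from the regression test at the end of this commit, and the query shown in the comment is assumed (not produced by this snippet):

```python
import json

# VARCHAR cell assumed to come from:
#   SELECT PERCENTILE(col1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100) FROM test
# The literal below is copied from the regression test further down in this commit.
raw = ("[1.900000, 2.800000, 3.700000, 4.600000, 5.500000, "
       "6.400000, 7.300000, 8.200000, 9.100000, 10.000000]")

percentiles = json.loads(raw)   # -> list of 10 floats
p50 = percentiles[4]            # the 5th entry corresponds to P = 50
print(p50)                      # 5.5
```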
@@ -564,18 +564,22 @@ Query OK, 2 row(s) in set (0.000793s)
### PERCENTILE
```
-SELECT PERCENTILE(field_name, P) FROM { tb_name } [WHERE clause];
+SELECT PERCENTILE(field_name, P [, P1] ...) FROM { tb_name } [WHERE clause];
```
**Description**: Returns the percentile of the values in a column of a table.
-**Return value type**: double-precision floating point (DOUBLE)
+**Return value type**: This function takes a minimum of 2 and a maximum of 11 parameters, and can return up to 10 percentiles at the same time. With 2 parameters it returns a single percentile of type DOUBLE; with more than 2 parameters the return type is VARCHAR, formatted as a JSON array containing all return values.
**Applicable columns**: Cannot be applied to columns of type timestamp, binary, nchar, or bool.
**Applicable to**: tables.
-**Usage notes**: *P* is in the range 0 ≤ *P* ≤ 100; 0 is equivalent to MIN and 100 is equivalent to MAX.
+**Usage notes**:
+- *P* is in the range 0 ≤ *P* ≤ 100; 0 is equivalent to MIN and 100 is equivalent to MAX.
+- When computing multiple percentiles of the same column, using one PERCENTILE function with multiple parameters is recommended, as it can greatly reduce the query response time.
+For example, SELECT percentile(col, 90, 95, 99) FROM table will perform better than SELECT percentile(col, 90), percentile(col, 95), percentile(col, 99) FROM table.
**Examples**:
......
@@ -3289,9 +3289,12 @@ int32_t addExprAndResultField(SSqlCmd* pCmd, SQueryInfo* pQueryInfo, int32_t col
      } else if (functionId == TSDB_FUNC_APERCT || functionId == TSDB_FUNC_TAIL) {
        size_t cnt = taosArrayGetSize(pItem->pNode->Expr.paramList);
        if (cnt != 2 && cnt != 3) valid = false;
+     } else if (functionId == TSDB_FUNC_PERCT) {
+       size_t cnt = taosArrayGetSize(pItem->pNode->Expr.paramList);
+       if (cnt < 2 || cnt > 11) valid = false;
      } else if (functionId == TSDB_FUNC_UNIQUE) {
        if (taosArrayGetSize(pItem->pNode->Expr.paramList) != 1) valid = false;
-     }else {
+     } else {
        if (taosArrayGetSize(pItem->pNode->Expr.paramList) != 2) valid = false;
      }
      if (!valid) {
@@ -3332,7 +3335,7 @@ int32_t addExprAndResultField(SSqlCmd* pCmd, SQueryInfo* pQueryInfo, int32_t col
      }
      tVariant* pVariant = NULL;
-     if (functionId != TSDB_FUNC_UNIQUE) {
+     if (functionId != TSDB_FUNC_UNIQUE && functionId != TSDB_FUNC_PERCT) {
        // 3. valid the parameters
        if (pParamElem[1].pNode->tokenId == TK_ID) {
          return invalidOperationMsg(tscGetErrorMsgPayload(pCmd), msg2);
@@ -3348,7 +3351,31 @@ int32_t addExprAndResultField(SSqlCmd* pCmd, SQueryInfo* pQueryInfo, int32_t col
      char val[8] = {0};
      SExprInfo* pExpr = NULL;
-     if (functionId == TSDB_FUNC_PERCT || functionId == TSDB_FUNC_APERCT) {
+     if (functionId == TSDB_FUNC_PERCT) {
+       int32_t numOfParams = (int32_t)taosArrayGetSize(pItem->pNode->Expr.paramList);
+       getResultDataInfo(pSchema->type, pSchema->bytes, functionId, numOfParams - 1, &resultType, &resultSize, &interResult, 0,
+                         false, pUdfInfo);
+       pExpr = tscExprAppend(pQueryInfo, functionId, &idx, resultType, resultSize, getNewResColId(pCmd),
+                             interResult, false);
+       for (int32_t i = 1; i < numOfParams; ++i) {
+         pVariant = &pParamElem[i].pNode->value;
+         if (pVariant->nType != TSDB_DATA_TYPE_DOUBLE && pVariant->nType != TSDB_DATA_TYPE_BIGINT) {
+           return invalidOperationMsg(tscGetErrorMsgPayload(pCmd), msg5);
+         }
+         tVariantDump(pVariant, val, TSDB_DATA_TYPE_DOUBLE, true);
+         double dp = GET_DOUBLE_VAL(val);
+         if (dp < 0 || dp > TOP_BOTTOM_QUERY_LIMIT) {
+           return invalidOperationMsg(tscGetErrorMsgPayload(pCmd), msg5);
+         }
+         tscExprAddParams(&pExpr->base, val, TSDB_DATA_TYPE_DOUBLE, sizeof(double));
+       }
+       tscInsertPrimaryTsSourceColumn(pQueryInfo, pTableMetaInfo->pTableMeta->id.uid);
+       colIndex += 1;  // the first column is ts
+     } else if (functionId == TSDB_FUNC_APERCT) {
        // param1 double
        if (pVariant->nType != TSDB_DATA_TYPE_DOUBLE && pVariant->nType != TSDB_DATA_TYPE_BIGINT) {
          return invalidOperationMsg(tscGetErrorMsgPayload(pCmd), msg5);
......
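The parser changes above enforce two rules for PERCENTILE: the argument count must stay between 2 and 11 (the column plus 1 to 10 P values), and every P must be a numeric literal within [0, TOP_BOTTOM_QUERY_LIMIT]. A small Python model of that validation, for illustration only; the function name is mine and the constant value of 100 is an assumption consistent with the documented 0–100 range of P:

```python
# Sketch of the argument checks applied to PERCENTILE(col, P, P1, ...),
# mirroring the cnt and dp checks in addExprAndResultField above.
TOP_BOTTOM_QUERY_LIMIT = 100  # assumed value, matching the documented range of P

def validate_percentile_args(args):
    """args = [column, P, P1, ...]; returns True if the call would be accepted."""
    if not (2 <= len(args) <= 11):           # cnt < 2 || cnt > 11 -> invalid
        return False
    for p in args[1:]:
        if not isinstance(p, (int, float)):  # only numeric literals are accepted
            return False
        if p < 0 or p > TOP_BOTTOM_QUERY_LIMIT:
            return False
    return True

assert validate_percentile_args(["col1", 90, 95, 99])
assert not validate_percentile_args(["col1"])         # too few parameters
assert not validate_percentile_args(["col1", 101])    # P out of range
assert not validate_percentile_args(
    ["col1", 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 101])  # too many parameters
```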
@@ -2780,7 +2780,7 @@ void tscExprAddParams(SSqlExpr* pExpr, char* argument, int32_t type, int32_t byt
  tVariantCreateFromBinary(&pExpr->param[pExpr->numOfParams], argument, bytes, type);
  pExpr->numOfParams += 1;
-  assert(pExpr->numOfParams <= 3);
+  assert(pExpr->numOfParams <= 10);
}
SExprInfo* tscExprGet(SQueryInfo* pQueryInfo, int32_t idx) {
......
@@ -59,7 +59,7 @@ typedef struct SSqlExpr {
                          // pQueryAttr->interBytesForGlobal
  int16_t  numOfParams;   // argument value of each function
-  tVariant param[3];      // parameters are not more than 3
+  tVariant param[10];     // parameters are not more than 10
  int32_t  offset;        // sub result column value of arithmetic expression.
  int16_t  resColId;      // result column id
......
@@ -210,7 +210,7 @@ typedef struct SQLFunctionCtx {
  int64_t  startTs;       // timestamp range of current query when function is executed on a specific data block
  int64_t  endTs;
  int32_t  numOfParams;
-  tVariant param[4];      // input parameter, e.g., top(k, 20), the number of results for top query is kept in param
+  tVariant param[10];     // input parameter, e.g., top(k, 20), the number of results for top query is kept in param
  int64_t *ptsList;       // corresponding timestamp array list
  void    *ptsOutputBuf;  // corresponding output buffer for timestamp of each result, e.g., top/bottom*/
  SQLPreAggVal preAggVals;
......
@@ -700,8 +700,13 @@ int32_t getResultDataInfo(int32_t dataType, int32_t dataBytes, int32_t functionI
    *bytes = sizeof(double);
    *interBytes = sizeof(SSpreadInfo);
  } else if (functionId == TSDB_FUNC_PERCT) {
+   if (param > 1) {
+     *type = (int16_t)TSDB_DATA_TYPE_BINARY;
+     *bytes = 512;
+   } else {
      *type = (int16_t)TSDB_DATA_TYPE_DOUBLE;
      *bytes = sizeof(double);
+   }
    *interBytes = sizeof(SPercentileInfo);
  } else if (functionId == TSDB_FUNC_LEASTSQR) {
    *type = TSDB_DATA_TYPE_BINARY;
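With more than one P, the output column switches from DOUBLE to a 512-byte BINARY slot. A rough Python check of that sizing, assuming values are rendered with C's default `%lf` precision (6 decimals) and joined with `", "` as in the finalizer below; the finalizer additionally passes the remaining buffer length to snprintf, so oversized values are truncated rather than overflowing:

```python
# Rough check of the 512-byte buffer chosen for multi-percentile output:
# each value is rendered like "%lf" (6 decimals), joined by ", ", and
# wrapped in brackets, as percentile_finalizer does below.
def json_array_len(values):
    body = ", ".join(f"{v:f}" for v in values)
    return len("[" + body + "]")

print(json_array_len([1.9, 2.8, 3.7, 4.6, 5.5, 6.4, 7.3, 8.2, 9.1, 10.0]))  # 101
print(json_array_len([1234567.0] * 10))                                     # 160
```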
@@ -3096,7 +3101,7 @@ static void percentile_function(SQLFunctionCtx *pCtx) {
}
static void percentile_finalizer(SQLFunctionCtx *pCtx) {
-  double v = pCtx->param[0].nType == TSDB_DATA_TYPE_INT ? pCtx->param[0].i64 : pCtx->param[0].dKey;
+  double v = 0;
  SResultRowCellInfo *pResInfo = GET_RES_INFO(pCtx);
  SPercentileInfo* ppInfo = (SPercentileInfo *) GET_ROWCELL_INTERBUF(pResInfo);
@@ -3107,8 +3112,26 @@ static void percentile_finalizer(SQLFunctionCtx *pCtx) {
    assert(ppInfo->numOfElems == 0);
    setNull(pCtx->pOutput, pCtx->outputType, pCtx->outputBytes);
  } else {
+   if (pCtx->numOfParams > 1) {
+     ((char *)varDataVal(pCtx->pOutput))[0] = '[';
+     size_t len = 1;
+     size_t maxBufLen = 512;
+     for (int32_t i = 0; i < pCtx->numOfParams; ++i) {
+       v = pCtx->param[i].nType == TSDB_DATA_TYPE_INT ? pCtx->param[i].i64 : pCtx->param[i].dKey;
+       if (i == pCtx->numOfParams - 1) {
+         len += snprintf((char *)varDataVal(pCtx->pOutput) + len, maxBufLen - len, "%lf]", getPercentile(pMemBucket, v));
+       } else {
+         len += snprintf((char *)varDataVal(pCtx->pOutput) + len, maxBufLen - len, "%lf, ", getPercentile(pMemBucket, v));
+       }
+     }
+     varDataSetLen(pCtx->pOutput, len);
+   } else {
+     v = pCtx->param[0].nType == TSDB_DATA_TYPE_INT ? pCtx->param[0].i64 : pCtx->param[0].dKey;
      SET_DOUBLE_VAL((double *)pCtx->pOutput, getPercentile(pMemBucket, v));
+   }
  }
  tMemBucketDestroy(pMemBucket);
  doFinalizer(pCtx);
......
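percentile_finalizer builds the VARCHAR result by hand: an opening bracket, one `%lf` per requested percentile separated by `", "`, and a closing bracket, which is what makes the output parseable as JSON. The same formatting expressed as a short, self-contained Python sketch; the percentile itself is computed here with plain linear interpolation over a sorted list (consistent with the numpy-based checks in the test below), whereas the real code reads it from the tMemBucket structure:

```python
# Sketch of the output formatting in percentile_finalizer: one "%lf" per
# requested P, ", "-separated, wrapped in "[...]". The percentile value is
# computed with simple linear interpolation for illustration only.
def linear_percentile(sorted_vals, p):
    rank = (len(sorted_vals) - 1) * p / 100.0
    lo, frac = int(rank), rank - int(rank)
    hi = min(lo + 1, len(sorted_vals) - 1)
    return sorted_vals[lo] + (sorted_vals[hi] - sorted_vals[lo]) * frac

def format_percentiles(sorted_vals, ps):
    parts = [f"{linear_percentile(sorted_vals, p):f}" for p in ps]  # %lf => 6 decimals
    return "[" + ", ".join(parts) + "]"

data = list(range(1, 11))  # 1..10, matching the values checked by the test below
print(format_percentiles(data, [10, 20, 30]))  # [1.900000, 2.800000, 3.700000]
```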
@@ -9775,7 +9775,13 @@ int32_t createQueryFunc(SQueriedTableInfo* pTableInfo, int32_t numOfOutput, SExp
      }
    }
-   int32_t param = (int32_t)pExprs[i].base.param[0].i64;
+   int32_t param;
+   if (pExprs[i].base.functionId != TSDB_FUNC_PERCT) {
+     param = (int32_t)pExprs[i].base.param[0].i64;
+   } else {
+     param = pExprs[i].base.numOfParams;
+   }
    if (pExprs[i].base.functionId > 0 && pExprs[i].base.functionId != TSDB_FUNC_SCALAR_EXPR &&
        !isTimeWindowFunction(pExprs[i].base.functionId) &&
        (type != pExprs[i].base.colType || bytes != pExprs[i].base.colBytes)) {
......
@@ -51,6 +51,19 @@ class TDTestCase:
        tdSql.error("select percentile(col9 20) from test")
        tdSql.error("select apercentile(col9 20) from test")

+       tdSql.error(f'select percentile(col1) from test')
+       tdSql.error(f'select percentile(col1, -1) from test')
+       tdSql.error(f'select percentile(col1, 101) from test')
+       tdSql.error(f'select percentile(col1, col2) from test')
+       tdSql.error(f'select percentile(1, col1) from test')
+       tdSql.error(f'select percentile(col1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 101) from test')
+       tdSql.query(f'select percentile(col1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100) from test')
+       tdSql.checkData(0, 0, '[1.900000, 2.800000, 3.700000, 4.600000, 5.500000, 6.400000, 7.300000, 8.200000, 9.100000, 10.000000]')
+       tdSql.query(f'select percentile(col1, 9.9, 19.9, 29.9, 39.9, 49.9, 59.9, 69.9, 79.9, 89.9, 99.9) from test')
+       tdSql.checkData(0, 0, '[1.891000, 2.791000, 3.691000, 4.591000, 5.491000, 6.391000, 7.291000, 8.191000, 9.091000, 9.991000]')

        tdSql.query("select percentile(col1, 0) from test")
        tdSql.checkData(0, 0, np.percentile(intData, 0))
        tdSql.query("select apercentile(col1, 0) from test")
......
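The expected strings in the new test cases are consistent with NumPy's default linear interpolation applied to a column holding the integers 1 through 10; the INSERT statements are outside this hunk, so that data set is an inference rather than something shown in the diff. A quick way to reproduce the second expected string:

```python
import numpy as np

# Assumed contents of col1 (1..10); the INSERT statements are not part of this hunk.
int_data = list(range(1, 11))
ps = [9.9, 19.9, 29.9, 39.9, 49.9, 59.9, 69.9, 79.9, 89.9, 99.9]

expected = "[" + ", ".join(f"{np.percentile(int_data, p):f}" for p in ps) + "]"
print(expected)
# [1.891000, 2.791000, 3.691000, 4.591000, 5.491000, 6.391000, 7.291000, 8.191000, 9.091000, 9.991000]
```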