Unverified commit 584af372, authored by dapan1121, committed by GitHub

Merge pull request #20124 from taosdata/feat/TS-2671-2.6

feat(query): optimize percentile function performance
@@ -569,18 +569,23 @@ Query OK, 2 row(s) in set (0.000793s)
### PERCENTILE
```
-SELECT PERCENTILE(field_name, P) FROM { tb_name } [WHERE clause];
+SELECT PERCENTILE(field_name, P [, P1] ...) FROM { tb_name } [WHERE clause];
```
**Description**: The value whose rank in a specific column matches the specified percentage. If such a value matching the specified percentage doesn't exist in the column, an interpolation value will be returned.
-**Return value type**: Double precision floating point
+**Return value type**: This function takes a minimum of 2 and a maximum of 11 parameters, and can return up to 10 percentiles at once. If 2 parameters are given, a single percentile is returned and the value type is DOUBLE.
+If more than 2 parameters are given, the return value type is a VARCHAR string formatted as a JSON array containing all return values.
**Applicable column types**: Data types except for timestamp, binary, nchar and bool
**Applicable table types**: table
-**More explanations**: _P_ is in range [0,100], when _P_ is 0, the result is same as using function MIN; when _P_ is 100, the result is same as function MAX.
+**More explanations**:
+- _P_ is in range [0,100]; when _P_ is 0, the result is the same as the MIN function, and when _P_ is 100, the result is the same as MAX.
+- When calculating multiple percentiles of a specific column, a single PERCENTILE function with multiple parameters is advised, as this can greatly reduce the query response time.
+For example, SELECT percentile(col, 90, 95, 99) FROM table will perform better than SELECT percentile(col, 90), percentile(col, 95), percentile(col, 99) FROM table.
**Examples**:
......
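When more than one P is supplied, the whole result arrives in a single VARCHAR cell as a JSON array, so a client has to decode it before using the individual values. Below is a minimal parsing sketch in Python; the string literal is copied from the regression test at the end of this commit, and the query shown in the comment is assumed (not produced by this snippet):

```python
import json

# VARCHAR cell assumed to come from:
#   SELECT PERCENTILE(col1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100) FROM test
# The literal below is copied from the regression test further down in this commit.
raw = ("[1.900000, 2.800000, 3.700000, 4.600000, 5.500000, "
       "6.400000, 7.300000, 8.200000, 9.100000, 10.000000]")

percentiles = json.loads(raw)   # -> list of 10 floats
p50 = percentiles[4]            # the 5th entry corresponds to P = 50
print(p50)                      # 5.5
```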
@@ -564,18 +564,22 @@ Query OK, 2 row(s) in set (0.000793s)
### PERCENTILE
```
-SELECT PERCENTILE(field_name, P) FROM { tb_name } [WHERE clause];
+SELECT PERCENTILE(field_name, P [, P1] ...) FROM { tb_name } [WHERE clause];
```
**Description**: Returns the percentile of the values in a column of a table.
-**Return value type**: double-precision floating point (DOUBLE)
+**Return value type**: This function takes a minimum of 2 and a maximum of 11 parameters, and can return up to 10 percentiles at the same time. With 2 parameters it returns a single percentile of type DOUBLE; with more than 2 parameters the return type is VARCHAR, formatted as a JSON array containing all return values.
**Applicable columns**: Cannot be applied to columns of type timestamp, binary, nchar, or bool.
**Applicable to**: tables.
-**Usage notes**: *P* is in the range 0 ≤ *P* ≤ 100; 0 is equivalent to MIN and 100 is equivalent to MAX.
+**Usage notes**:
+- *P* is in the range 0 ≤ *P* ≤ 100; 0 is equivalent to MIN and 100 is equivalent to MAX.
+- When computing multiple percentiles of the same column, using one PERCENTILE function with multiple parameters is recommended, as it can greatly reduce the query response time.
+For example, SELECT percentile(col, 90, 95, 99) FROM table will perform better than SELECT percentile(col, 90), percentile(col, 95), percentile(col, 99) FROM table.
**Examples**:
......
@@ -3289,9 +3289,12 @@ int32_t addExprAndResultField(SSqlCmd* pCmd, SQueryInfo* pQueryInfo, int32_t col
      } else if (functionId == TSDB_FUNC_APERCT || functionId == TSDB_FUNC_TAIL) {
        size_t cnt = taosArrayGetSize(pItem->pNode->Expr.paramList);
        if (cnt != 2 && cnt != 3) valid = false;
+     } else if (functionId == TSDB_FUNC_PERCT) {
+       size_t cnt = taosArrayGetSize(pItem->pNode->Expr.paramList);
+       if (cnt < 2 || cnt > 11) valid = false;
      } else if (functionId == TSDB_FUNC_UNIQUE) {
        if (taosArrayGetSize(pItem->pNode->Expr.paramList) != 1) valid = false;
-     }else {
+     } else {
        if (taosArrayGetSize(pItem->pNode->Expr.paramList) != 2) valid = false;
      }
      if (!valid) {
@@ -3332,7 +3335,7 @@ int32_t addExprAndResultField(SSqlCmd* pCmd, SQueryInfo* pQueryInfo, int32_t col
      }
      tVariant* pVariant = NULL;
-     if (functionId != TSDB_FUNC_UNIQUE) {
+     if (functionId != TSDB_FUNC_UNIQUE && functionId != TSDB_FUNC_PERCT) {
        // 3. valid the parameters
        if (pParamElem[1].pNode->tokenId == TK_ID) {
          return invalidOperationMsg(tscGetErrorMsgPayload(pCmd), msg2);
@@ -3348,7 +3351,31 @@ int32_t addExprAndResultField(SSqlCmd* pCmd, SQueryInfo* pQueryInfo, int32_t col
      char val[8] = {0};
      SExprInfo* pExpr = NULL;
-     if (functionId == TSDB_FUNC_PERCT || functionId == TSDB_FUNC_APERCT) {
+     if (functionId == TSDB_FUNC_PERCT) {
+       int32_t numOfParams = (int32_t)taosArrayGetSize(pItem->pNode->Expr.paramList);
+       getResultDataInfo(pSchema->type, pSchema->bytes, functionId, numOfParams - 1, &resultType, &resultSize, &interResult, 0,
+                         false, pUdfInfo);
+       pExpr = tscExprAppend(pQueryInfo, functionId, &idx, resultType, resultSize, getNewResColId(pCmd),
+                             interResult, false);
+       for (int32_t i = 1; i < numOfParams; ++i) {
+         pVariant = &pParamElem[i].pNode->value;
+         if (pVariant->nType != TSDB_DATA_TYPE_DOUBLE && pVariant->nType != TSDB_DATA_TYPE_BIGINT) {
+           return invalidOperationMsg(tscGetErrorMsgPayload(pCmd), msg5);
+         }
+         tVariantDump(pVariant, val, TSDB_DATA_TYPE_DOUBLE, true);
+         double dp = GET_DOUBLE_VAL(val);
+         if (dp < 0 || dp > TOP_BOTTOM_QUERY_LIMIT) {
+           return invalidOperationMsg(tscGetErrorMsgPayload(pCmd), msg5);
+         }
+         tscExprAddParams(&pExpr->base, val, TSDB_DATA_TYPE_DOUBLE, sizeof(double));
+       }
+       tscInsertPrimaryTsSourceColumn(pQueryInfo, pTableMetaInfo->pTableMeta->id.uid);
+       colIndex += 1;  // the first column is ts
+     } else if (functionId == TSDB_FUNC_APERCT) {
        // param1 double
        if (pVariant->nType != TSDB_DATA_TYPE_DOUBLE && pVariant->nType != TSDB_DATA_TYPE_BIGINT) {
          return invalidOperationMsg(tscGetErrorMsgPayload(pCmd), msg5);
......
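The parser changes above enforce two rules for PERCENTILE: the argument count must stay between 2 and 11 (the column plus 1 to 10 P values), and every P must be a numeric literal within [0, TOP_BOTTOM_QUERY_LIMIT]. A small Python model of that validation, for illustration only; the function name is mine and the constant value of 100 is an assumption consistent with the documented 0–100 range of P:

```python
# Sketch of the argument checks applied to PERCENTILE(col, P, P1, ...),
# mirroring the cnt and dp checks in addExprAndResultField above.
TOP_BOTTOM_QUERY_LIMIT = 100  # assumed value, matching the documented range of P

def validate_percentile_args(args):
    """args = [column, P, P1, ...]; returns True if the call would be accepted."""
    if not (2 <= len(args) <= 11):           # cnt < 2 || cnt > 11 -> invalid
        return False
    for p in args[1:]:
        if not isinstance(p, (int, float)):  # only numeric literals are accepted
            return False
        if p < 0 or p > TOP_BOTTOM_QUERY_LIMIT:
            return False
    return True

assert validate_percentile_args(["col1", 90, 95, 99])
assert not validate_percentile_args(["col1"])         # too few parameters
assert not validate_percentile_args(["col1", 101])    # P out of range
assert not validate_percentile_args(
    ["col1", 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 101])  # too many parameters
```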
@@ -2780,7 +2780,7 @@ void tscExprAddParams(SSqlExpr* pExpr, char* argument, int32_t type, int32_t byt
  tVariantCreateFromBinary(&pExpr->param[pExpr->numOfParams], argument, bytes, type);
  pExpr->numOfParams += 1;
-  assert(pExpr->numOfParams <= 3);
+  assert(pExpr->numOfParams <= 10);
}
SExprInfo* tscExprGet(SQueryInfo* pQueryInfo, int32_t idx) {
......
@@ -59,7 +59,7 @@ typedef struct SSqlExpr {
                          // pQueryAttr->interBytesForGlobal
  int16_t  numOfParams;   // argument value of each function
-  tVariant param[3];      // parameters are not more than 3
+  tVariant param[10];     // parameters are not more than 10
  int32_t  offset;        // sub result column value of arithmetic expression.
  int16_t  resColId;      // result column id
......
@@ -210,7 +210,7 @@ typedef struct SQLFunctionCtx {
  int64_t  startTs;       // timestamp range of current query when function is executed on a specific data block
  int64_t  endTs;
  int32_t  numOfParams;
-  tVariant param[4];      // input parameter, e.g., top(k, 20), the number of results for top query is kept in param
+  tVariant param[10];     // input parameter, e.g., top(k, 20), the number of results for top query is kept in param
  int64_t *ptsList;       // corresponding timestamp array list
  void    *ptsOutputBuf;  // corresponding output buffer for timestamp of each result, e.g., top/bottom*/
  SQLPreAggVal preAggVals;
......
@@ -700,8 +700,13 @@ int32_t getResultDataInfo(int32_t dataType, int32_t dataBytes, int32_t functionI
    *bytes = sizeof(double);
    *interBytes = sizeof(SSpreadInfo);
  } else if (functionId == TSDB_FUNC_PERCT) {
+   if (param > 1) {
+     *type = (int16_t)TSDB_DATA_TYPE_BINARY;
+     *bytes = 512;
+   } else {
      *type = (int16_t)TSDB_DATA_TYPE_DOUBLE;
      *bytes = sizeof(double);
+   }
    *interBytes = sizeof(SPercentileInfo);
  } else if (functionId == TSDB_FUNC_LEASTSQR) {
    *type = TSDB_DATA_TYPE_BINARY;
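With more than one P, the output column switches from DOUBLE to a 512-byte BINARY slot. A rough Python check of that sizing, assuming values are rendered with C's default `%lf` precision (6 decimals) and joined with `", "` as in the finalizer below; the finalizer additionally passes the remaining buffer length to snprintf, so oversized values are truncated rather than overflowing:

```python
# Rough check of the 512-byte buffer chosen for multi-percentile output:
# each value is rendered like "%lf" (6 decimals), joined by ", ", and
# wrapped in brackets, as percentile_finalizer does below.
def json_array_len(values):
    body = ", ".join(f"{v:f}" for v in values)
    return len("[" + body + "]")

print(json_array_len([1.9, 2.8, 3.7, 4.6, 5.5, 6.4, 7.3, 8.2, 9.1, 10.0]))  # 101
print(json_array_len([1234567.0] * 10))                                     # 160
```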
@@ -3096,7 +3101,7 @@ static void percentile_function(SQLFunctionCtx *pCtx) {
}
static void percentile_finalizer(SQLFunctionCtx *pCtx) {
-  double v = pCtx->param[0].nType == TSDB_DATA_TYPE_INT ? pCtx->param[0].i64 : pCtx->param[0].dKey;
+  double v = 0;
  SResultRowCellInfo *pResInfo = GET_RES_INFO(pCtx);
  SPercentileInfo* ppInfo = (SPercentileInfo *) GET_ROWCELL_INTERBUF(pResInfo);
@@ -3107,8 +3112,26 @@ static void percentile_finalizer(SQLFunctionCtx *pCtx) {
    assert(ppInfo->numOfElems == 0);
    setNull(pCtx->pOutput, pCtx->outputType, pCtx->outputBytes);
  } else {
+   if (pCtx->numOfParams > 1) {
+     ((char *)varDataVal(pCtx->pOutput))[0] = '[';
+     size_t len = 1;
+     size_t maxBufLen = 512;
+     for (int32_t i = 0; i < pCtx->numOfParams; ++i) {
+       v = pCtx->param[i].nType == TSDB_DATA_TYPE_INT ? pCtx->param[i].i64 : pCtx->param[i].dKey;
+       if (i == pCtx->numOfParams - 1) {
+         len += snprintf((char *)varDataVal(pCtx->pOutput) + len, maxBufLen - len, "%lf]", getPercentile(pMemBucket, v));
+       } else {
+         len += snprintf((char *)varDataVal(pCtx->pOutput) + len, maxBufLen - len, "%lf, ", getPercentile(pMemBucket, v));
+       }
+     }
+     varDataSetLen(pCtx->pOutput, len);
+   } else {
+     v = pCtx->param[0].nType == TSDB_DATA_TYPE_INT ? pCtx->param[0].i64 : pCtx->param[0].dKey;
      SET_DOUBLE_VAL((double *)pCtx->pOutput, getPercentile(pMemBucket, v));
+   }
  }
  tMemBucketDestroy(pMemBucket);
  doFinalizer(pCtx);
......
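percentile_finalizer builds the VARCHAR result by hand: an opening bracket, one `%lf` per requested percentile separated by `", "`, and a closing bracket, which is what makes the output parseable as JSON. The same formatting expressed as a short, self-contained Python sketch; the percentile itself is computed here with plain linear interpolation over a sorted list (consistent with the numpy-based checks in the test below), whereas the real code reads it from the tMemBucket structure:

```python
# Sketch of the output formatting in percentile_finalizer: one "%lf" per
# requested P, ", "-separated, wrapped in "[...]". The percentile value is
# computed with simple linear interpolation for illustration only.
def linear_percentile(sorted_vals, p):
    rank = (len(sorted_vals) - 1) * p / 100.0
    lo, frac = int(rank), rank - int(rank)
    hi = min(lo + 1, len(sorted_vals) - 1)
    return sorted_vals[lo] + (sorted_vals[hi] - sorted_vals[lo]) * frac

def format_percentiles(sorted_vals, ps):
    parts = [f"{linear_percentile(sorted_vals, p):f}" for p in ps]  # %lf => 6 decimals
    return "[" + ", ".join(parts) + "]"

data = list(range(1, 11))  # 1..10, matching the values checked by the test below
print(format_percentiles(data, [10, 20, 30]))  # [1.900000, 2.800000, 3.700000]
```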
@@ -9775,7 +9775,13 @@ int32_t createQueryFunc(SQueriedTableInfo* pTableInfo, int32_t numOfOutput, SExp
      }
    }
-   int32_t param = (int32_t)pExprs[i].base.param[0].i64;
+   int32_t param;
+   if (pExprs[i].base.functionId != TSDB_FUNC_PERCT) {
+     param = (int32_t)pExprs[i].base.param[0].i64;
+   } else {
+     param = pExprs[i].base.numOfParams;
+   }
    if (pExprs[i].base.functionId > 0 && pExprs[i].base.functionId != TSDB_FUNC_SCALAR_EXPR &&
        !isTimeWindowFunction(pExprs[i].base.functionId) &&
        (type != pExprs[i].base.colType || bytes != pExprs[i].base.colBytes)) {
......
@@ -51,6 +51,19 @@ class TDTestCase:
        tdSql.error("select percentile(col9 20) from test")
        tdSql.error("select apercentile(col9 20) from test")

+       tdSql.error(f'select percentile(col1) from test')
+       tdSql.error(f'select percentile(col1, -1) from test')
+       tdSql.error(f'select percentile(col1, 101) from test')
+       tdSql.error(f'select percentile(col1, col2) from test')
+       tdSql.error(f'select percentile(1, col1) from test')
+       tdSql.error(f'select percentile(col1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 101) from test')
+       tdSql.query(f'select percentile(col1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100) from test')
+       tdSql.checkData(0, 0, '[1.900000, 2.800000, 3.700000, 4.600000, 5.500000, 6.400000, 7.300000, 8.200000, 9.100000, 10.000000]')
+       tdSql.query(f'select percentile(col1, 9.9, 19.9, 29.9, 39.9, 49.9, 59.9, 69.9, 79.9, 89.9, 99.9) from test')
+       tdSql.checkData(0, 0, '[1.891000, 2.791000, 3.691000, 4.591000, 5.491000, 6.391000, 7.291000, 8.191000, 9.091000, 9.991000]')

        tdSql.query("select percentile(col1, 0) from test")
        tdSql.checkData(0, 0, np.percentile(intData, 0))
        tdSql.query("select apercentile(col1, 0) from test")
......
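The expected strings in the new test cases are consistent with NumPy's default linear interpolation applied to a column holding the integers 1 through 10; the INSERT statements are outside this hunk, so that data set is an inference rather than something shown in the diff. A quick way to reproduce the second expected string:

```python
import numpy as np

# Assumed contents of col1 (1..10); the INSERT statements are not part of this hunk.
int_data = list(range(1, 11))
ps = [9.9, 19.9, 29.9, 39.9, 49.9, 59.9, 69.9, 79.9, 89.9, 99.9]

expected = "[" + ", ".join(f"{np.percentile(int_data, p):f}" for p in ps) + "]"
print(expected)
# [1.891000, 2.791000, 3.691000, 4.591000, 5.491000, 6.391000, 7.291000, 8.191000, 9.091000, 9.991000]
```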