Unverified commit 584af372, authored by dapan1121, committed by GitHub

Merge pull request #20124 from taosdata/feat/TS-2671-2.6

feat(query): optimize percentile function performance
......@@ -569,18 +569,23 @@ Query OK, 2 row(s) in set (0.000793s)
### PERCENTILE
```
SELECT PERCENTILE(field_name, P) FROM { tb_name } [WHERE clause];
SELECT PERCENTILE(field_name, P [, P1] ...) FROM { tb_name } [WHERE clause];
```
**Description**: The value whose rank in a specific column matches the specified percentage. If no value in the column matches the percentage exactly, an interpolated value is returned.
**Return value type**: Double precision floating point
**Return value type**: This function takes a minimum of 2 and a maximum of 11 parameters, and can return at most 10 percentiles at a time. If 2 parameters are given, a single percentile is returned and the value type is DOUBLE.
If more than 2 parameters are given, the return value type is a VARCHAR string formatted as a JSON array containing all return values.
**Applicable column types**: Data types except for timestamp, binary, nchar and bool
**Applicable table types**: table
**More explanations**: _P_ is in range [0,100], when _P_ is 0, the result is same as using function MIN; when _P_ is 100, the result is same as function MAX.
**More explanations**:
- _P_ is in the range [0,100]. When _P_ is 0, the result is the same as the MIN function; when _P_ is 100, the result is the same as the MAX function.
- When calculating multiple percentiles of a specific column, a single PERCENTILE function call with multiple parameters is advised, as this can greatly reduce the query response time; a client-side sketch of such a call follows this list.
For example, SELECT percentile(col, 90, 95, 99) FROM table will perform better than SELECT percentile(col, 90), percentile(col, 95), percentile(col, 99) FROM table.
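Below is a minimal client-side sketch of issuing the multi-parameter call and parsing the JSON-array result. It assumes the taospy Python connector, placeholder connection parameters, a database `db`, and a table `test` with an INT column `col3`; these names are illustrative and not taken from this change.
```python
import json

import taos  # taospy connector; the connection details below are placeholders

# Assumed deployment: local server with default credentials and a database `db`.
conn = taos.connect(host="localhost", user="root", password="taosdata", database="db")
cursor = conn.cursor()

# One PERCENTILE call with several P values returns a single VARCHAR cell
# containing a JSON array, one element per requested percentile.
cursor.execute("SELECT PERCENTILE(col3, 90, 95, 99) FROM test")
row = cursor.fetchall()[0]

values = json.loads(row[0])  # e.g. [9.100000, 9.550000, 9.910000]
for p, value in zip((90, 95, 99), values):
    print(f"P{p}: {value}")

cursor.close()
conn.close()
```
The array elements appear in the same order as the P arguments, so they can be paired back to the requested percentiles by position, and the result stays a single column no matter how many percentiles are requested.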
**Examples**:
......
......@@ -564,18 +564,22 @@ Query OK, 2 row(s) in set (0.000793s)
### PERCENTILE
```
SELECT PERCENTILE(field_name, P) FROM { tb_name } [WHERE clause];
SELECT PERCENTILE(field_name, P [, P1] ...) FROM { tb_name } [WHERE clause];
```
**Description**: Returns the percentile of the values in a specific column of the table.
**Return value type**: Double precision floating point (DOUBLE)
**Return value type**: This function takes a minimum of 2 and a maximum of 11 parameters, and can return at most 10 percentiles at a time. When 2 parameters are given, a single percentile is returned and its type is DOUBLE; when more than 2 parameters are given, the return type is VARCHAR, formatted as a JSON array containing all return values.
**Applicable column types**: Cannot be applied to columns of type timestamp, binary, nchar, or bool.
**Applicable table types**: table.
**More explanations**: _P_ is in the range 0 ≤ _P_ ≤ 100; a value of 0 is equivalent to MIN and a value of 100 is equivalent to MAX.
**More explanations**:
- _P_ is in the range 0 ≤ _P_ ≤ 100; a value of 0 is equivalent to MIN and a value of 100 is equivalent to MAX.
- When calculating multiple percentiles of the same column, a single PERCENTILE function call with multiple parameters is advised, as this can greatly reduce the query response time; a rough timing sketch follows this list.
For example, SELECT percentile(col, 90, 95, 99) FROM table will perform better than SELECT percentile(col, 90), percentile(col, 95), percentile(col, 99) FROM table.
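The sketch below is a loose, client-independent analogy for the response-time point above, not the engine's actual code path: it contrasts answering several percentiles from one pass over the data with redoing the per-percentile work separately. It only assumes NumPy, which the test case in this change already uses.
```python
import time

import numpy as np

# Synthetic data standing in for a column; the size and P values are arbitrary.
data = np.random.default_rng(0).random(2_000_000)
ps = (90, 95, 99)

# All percentiles answered from the same pass, roughly analogous to a single
# PERCENTILE(col, 90, 95, 99) call that builds its buckets once.
start = time.perf_counter()
np.percentile(data, ps)
shared = time.perf_counter() - start

# Separate calls repeat the selection work once per percentile, analogous to
# SELECT percentile(col, 90), percentile(col, 95), percentile(col, 99).
start = time.perf_counter()
for p in ps:
    np.percentile(data, p)
separate = time.perf_counter() - start

print(f"shared pass:     {shared:.4f}s")
print(f"separate passes: {separate:.4f}s")
```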
**Examples**:
......
......@@ -2684,7 +2684,7 @@ static int32_t setExprInfoForFunctions(SSqlCmd* pCmd, SQueryInfo* pQueryInfo, SS
pExpr->base.param[0].i64 = TSDB_ORDER_DESC;
pExpr->base.param[0].nType = TSDB_DATA_TYPE_INT;
}
// for all queries, the timestamp column needs to be loaded
SSchema s = {.colId = PRIMARYKEY_TIMESTAMP_COL_INDEX, .bytes = TSDB_KEYSIZE, .type = TSDB_DATA_TYPE_TIMESTAMP,};
tscColumnListInsert(pQueryInfo->colList, PRIMARYKEY_TIMESTAMP_COL_INDEX, pExpr->base.uid, &s);
......@@ -3289,9 +3289,12 @@ int32_t addExprAndResultField(SSqlCmd* pCmd, SQueryInfo* pQueryInfo, int32_t col
} else if (functionId == TSDB_FUNC_APERCT || functionId == TSDB_FUNC_TAIL) {
size_t cnt = taosArrayGetSize(pItem->pNode->Expr.paramList);
if (cnt != 2 && cnt != 3) valid = false;
} else if (functionId == TSDB_FUNC_PERCT) {
size_t cnt = taosArrayGetSize(pItem->pNode->Expr.paramList);
if (cnt < 2 || cnt > 11) valid = false;
} else if (functionId == TSDB_FUNC_UNIQUE) {
if (taosArrayGetSize(pItem->pNode->Expr.paramList) != 1) valid = false;
}else {
} else {
if (taosArrayGetSize(pItem->pNode->Expr.paramList) != 2) valid = false;
}
if (!valid) {
......@@ -3332,7 +3335,7 @@ int32_t addExprAndResultField(SSqlCmd* pCmd, SQueryInfo* pQueryInfo, int32_t col
}
tVariant* pVariant = NULL;
if (functionId != TSDB_FUNC_UNIQUE) {
if (functionId != TSDB_FUNC_UNIQUE && functionId != TSDB_FUNC_PERCT) {
// 3. valid the parameters
if (pParamElem[1].pNode->tokenId == TK_ID) {
return invalidOperationMsg(tscGetErrorMsgPayload(pCmd), msg2);
......@@ -3348,7 +3351,31 @@ int32_t addExprAndResultField(SSqlCmd* pCmd, SQueryInfo* pQueryInfo, int32_t col
char val[8] = {0};
SExprInfo* pExpr = NULL;
if (functionId == TSDB_FUNC_PERCT || functionId == TSDB_FUNC_APERCT) {
if (functionId == TSDB_FUNC_PERCT) {
int32_t numOfParams = (int32_t)taosArrayGetSize(pItem->pNode->Expr.paramList);
getResultDataInfo(pSchema->type, pSchema->bytes, functionId, numOfParams - 1, &resultType, &resultSize, &interResult, 0,
false, pUdfInfo);
pExpr = tscExprAppend(pQueryInfo, functionId, &idx, resultType, resultSize, getNewResColId(pCmd),
interResult, false);
for (int32_t i = 1; i < numOfParams; ++i) {
pVariant = &pParamElem[i].pNode->value;
if (pVariant->nType != TSDB_DATA_TYPE_DOUBLE && pVariant->nType != TSDB_DATA_TYPE_BIGINT) {
return invalidOperationMsg(tscGetErrorMsgPayload(pCmd), msg5);
}
tVariantDump(pVariant, val, TSDB_DATA_TYPE_DOUBLE, true);
double dp = GET_DOUBLE_VAL(val);
if (dp < 0 || dp > TOP_BOTTOM_QUERY_LIMIT) {
return invalidOperationMsg(tscGetErrorMsgPayload(pCmd), msg5);
}
tscExprAddParams(&pExpr->base, val, TSDB_DATA_TYPE_DOUBLE, sizeof(double));
}
tscInsertPrimaryTsSourceColumn(pQueryInfo, pTableMetaInfo->pTableMeta->id.uid);
colIndex += 1; // the first column is ts
} else if (functionId == TSDB_FUNC_APERCT) {
// param1 double
if (pVariant->nType != TSDB_DATA_TYPE_DOUBLE && pVariant->nType != TSDB_DATA_TYPE_BIGINT) {
return invalidOperationMsg(tscGetErrorMsgPayload(pCmd), msg5);
......
......@@ -2780,7 +2780,7 @@ void tscExprAddParams(SSqlExpr* pExpr, char* argument, int32_t type, int32_t byt
tVariantCreateFromBinary(&pExpr->param[pExpr->numOfParams], argument, bytes, type);
pExpr->numOfParams += 1;
assert(pExpr->numOfParams <= 3);
assert(pExpr->numOfParams <= 10);
}
SExprInfo* tscExprGet(SQueryInfo* pQueryInfo, int32_t idx) {
......
......@@ -59,7 +59,7 @@ typedef struct SSqlExpr {
// pQueryAttr->interBytesForGlobal
int16_t numOfParams; // argument value of each function
tVariant param[3]; // parameters are not more than 3
tVariant param[10]; // parameters are not more than 10
int32_t offset; // sub result column value of arithmetic expression.
int16_t resColId; // result column id
......
......@@ -210,7 +210,7 @@ typedef struct SQLFunctionCtx {
int64_t startTs; // timestamp range of current query when function is executed on a specific data block
int64_t endTs;
int32_t numOfParams;
tVariant param[4]; // input parameter, e.g., top(k, 20), the number of results for top query is kept in param
tVariant param[10]; // input parameter, e.g., top(k, 20), the number of results for top query is kept in param
int64_t *ptsList; // corresponding timestamp array list
void *ptsOutputBuf; // corresponding output buffer for timestamp of each result, e.g., top/bottom*/
SQLPreAggVal preAggVals;
......
......@@ -700,8 +700,13 @@ int32_t getResultDataInfo(int32_t dataType, int32_t dataBytes, int32_t functionI
*bytes = sizeof(double);
*interBytes = sizeof(SSpreadInfo);
} else if (functionId == TSDB_FUNC_PERCT) {
*type = (int16_t)TSDB_DATA_TYPE_DOUBLE;
*bytes = sizeof(double);
if (param > 1) {
*type = (int16_t)TSDB_DATA_TYPE_BINARY;
*bytes = 512;
} else {
*type = (int16_t)TSDB_DATA_TYPE_DOUBLE;
*bytes = sizeof(double);
}
*interBytes = sizeof(SPercentileInfo);
} else if (functionId == TSDB_FUNC_LEASTSQR) {
*type = TSDB_DATA_TYPE_BINARY;
......@@ -3096,7 +3101,7 @@ static void percentile_function(SQLFunctionCtx *pCtx) {
}
static void percentile_finalizer(SQLFunctionCtx *pCtx) {
double v = pCtx->param[0].nType == TSDB_DATA_TYPE_INT ? pCtx->param[0].i64 : pCtx->param[0].dKey;
double v = 0;
SResultRowCellInfo *pResInfo = GET_RES_INFO(pCtx);
SPercentileInfo* ppInfo = (SPercentileInfo *) GET_ROWCELL_INTERBUF(pResInfo);
......@@ -3107,7 +3112,25 @@ static void percentile_finalizer(SQLFunctionCtx *pCtx) {
assert(ppInfo->numOfElems == 0);
setNull(pCtx->pOutput, pCtx->outputType, pCtx->outputBytes);
} else {
SET_DOUBLE_VAL((double *)pCtx->pOutput, getPercentile(pMemBucket, v));
if (pCtx->numOfParams > 1) {
((char *)varDataVal(pCtx->pOutput))[0] = '[';
size_t len = 1;
size_t maxBufLen = 512;
for (int32_t i = 0; i < pCtx->numOfParams; ++i) {
v = pCtx->param[i].nType == TSDB_DATA_TYPE_INT ? pCtx->param[i].i64 : pCtx->param[i].dKey;
if (i == pCtx->numOfParams - 1) {
len += snprintf((char *)varDataVal(pCtx->pOutput) + len, maxBufLen - len, "%lf]", getPercentile(pMemBucket, v));
} else {
len += snprintf((char *)varDataVal(pCtx->pOutput) + len, maxBufLen - len, "%lf, ", getPercentile(pMemBucket, v));
}
}
varDataSetLen(pCtx->pOutput, len);
} else {
v = pCtx->param[0].nType == TSDB_DATA_TYPE_INT ? pCtx->param[0].i64 : pCtx->param[0].dKey;
SET_DOUBLE_VAL((double *)pCtx->pOutput, getPercentile(pMemBucket, v));
}
}
tMemBucketDestroy(pMemBucket);
......
......@@ -9775,7 +9775,13 @@ int32_t createQueryFunc(SQueriedTableInfo* pTableInfo, int32_t numOfOutput, SExp
}
}
int32_t param = (int32_t)pExprs[i].base.param[0].i64;
int32_t param;
if (pExprs[i].base.functionId != TSDB_FUNC_PERCT) {
param = (int32_t)pExprs[i].base.param[0].i64;
} else {
param = pExprs[i].base.numOfParams;
}
if (pExprs[i].base.functionId > 0 && pExprs[i].base.functionId != TSDB_FUNC_SCALAR_EXPR &&
!isTimeWindowFunction(pExprs[i].base.functionId) &&
(type != pExprs[i].base.colType || bytes != pExprs[i].base.colBytes)) {
......
......@@ -26,32 +26,45 @@ class TDTestCase:
self.rowNum = 10
self.ts = 1537146000000
def run(self):
tdSql.prepare()
intData = []
floatData = []
tdSql.execute('''create table test(ts timestamp, col1 tinyint, col2 smallint, col3 int, col4 bigint, col5 float, col6 double,
col7 bool, col8 binary(20), col9 nchar(20), col11 tinyint unsigned, col12 smallint unsigned, col13 int unsigned, col14 bigint unsigned)''')
for i in range(self.rowNum):
tdSql.execute("insert into test values(%d, %d, %d, %d, %d, %f, %f, %d, 'taosdata%d', '涛思数据%d', %d, %d, %d, %d)"
tdSql.execute("insert into test values(%d, %d, %d, %d, %d, %f, %f, %d, 'taosdata%d', '涛思数据%d', %d, %d, %d, %d)"
% (self.ts + i, i + 1, i + 1, i + 1, i + 1, i + 0.1, i + 0.1, i % 2, i + 1, i + 1, i + 1, i + 1, i + 1, i + 1))
intData.append(i + 1)
floatData.append(i + 0.1)
# percentile verification
tdSql.error("select percentile(ts 20) from test")
tdSql.error("select apercentile(ts 20) from test")
tdSql.error("select percentile(col7 20) from test")
tdSql.error("select apercentile(col7 20) from test")
tdSql.error("select percentile(col8 20) from test")
tdSql.error("select percentile(col8 20) from test")
tdSql.error("select apercentile(col8 20) from test")
tdSql.error("select percentile(col9 20) from test")
tdSql.error("select apercentile(col9 20) from test")
tdSql.error("select apercentile(col9 20) from test")
tdSql.error(f'select percentile(col1) from test')
tdSql.error(f'select percentile(col1, -1) from test')
tdSql.error(f'select percentile(col1, 101) from test')
tdSql.error(f'select percentile(col1, col2) from test')
tdSql.error(f'select percentile(1, col1) from test')
tdSql.error(f'select percentile(col1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 101) from test')
tdSql.query(f'select percentile(col1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100) from test')
tdSql.checkData(0, 0, '[1.900000, 2.800000, 3.700000, 4.600000, 5.500000, 6.400000, 7.300000, 8.200000, 9.100000, 10.000000]')
tdSql.query(f'select percentile(col1, 9.9, 19.9, 29.9, 39.9, 49.9, 59.9, 69.9, 79.9, 89.9, 99.9) from test')
tdSql.checkData(0, 0, '[1.891000, 2.791000, 3.691000, 4.591000, 5.491000, 6.391000, 7.291000, 8.191000, 9.091000, 9.991000]')
tdSql.query("select percentile(col1, 0) from test")
tdSql.query("select percentile(col1, 0) from test")
tdSql.checkData(0, 0, np.percentile(intData, 0))
tdSql.query("select apercentile(col1, 0) from test")
print("apercentile result: %s" % tdSql.getData(0, 0))
......@@ -62,7 +75,7 @@ class TDTestCase:
tdSql.query("select percentile(col1, 100) from test")
tdSql.checkData(0, 0, np.percentile(intData, 100))
tdSql.query("select apercentile(col1, 100) from test")
print("apercentile result: %s" % tdSql.getData(0, 0))
print("apercentile result: %s" % tdSql.getData(0, 0))
tdSql.query("select percentile(col2, 0) from test")
tdSql.checkData(0, 0, np.percentile(intData, 0))
......@@ -73,7 +86,7 @@ class TDTestCase:
tdSql.query("select apercentile(col2, 50) from test")
print("apercentile result: %s" % tdSql.getData(0, 0))
tdSql.query("select percentile(col2, 100) from test")
tdSql.checkData(0, 0, np.percentile(intData, 100))
tdSql.query("select apercentile(col2, 100) from test")
print("apercentile result: %s" % tdSql.getData(0, 0))
......@@ -99,11 +112,11 @@ class TDTestCase:
tdSql.query("select apercentile(col4, 50) from test")
print("apercentile result: %s" % tdSql.getData(0, 0))
tdSql.query("select percentile(col4, 100) from test")
tdSql.checkData(0, 0, np.percentile(intData, 100))
tdSql.query("select apercentile(col4, 100) from test")
print("apercentile result: %s" % tdSql.getData(0, 0))
tdSql.query("select percentile(col11, 0) from test")
tdSql.query("select percentile(col11, 0) from test")
tdSql.checkData(0, 0, np.percentile(intData, 0))
tdSql.query("select apercentile(col11, 0) from test")
print("apercentile result: %s" % tdSql.getData(0, 0))
......@@ -114,7 +127,7 @@ class TDTestCase:
tdSql.query("select percentile(col11, 100) from test")
tdSql.checkData(0, 0, np.percentile(intData, 100))
tdSql.query("select apercentile(col11, 100) from test")
print("apercentile result: %s" % tdSql.getData(0, 0))
print("apercentile result: %s" % tdSql.getData(0, 0))
tdSql.query("select percentile(col12, 0) from test")
tdSql.checkData(0, 0, np.percentile(intData, 0))
......@@ -125,7 +138,7 @@ class TDTestCase:
tdSql.query("select apercentile(col12, 50) from test")
print("apercentile result: %s" % tdSql.getData(0, 0))
tdSql.query("select percentile(col12, 100) from test")
tdSql.checkData(0, 0, np.percentile(intData, 100))
tdSql.query("select apercentile(col12, 100) from test")
print("apercentile result: %s" % tdSql.getData(0, 0))
......@@ -151,11 +164,11 @@ class TDTestCase:
tdSql.query("select apercentile(col14, 50) from test")
print("apercentile result: %s" % tdSql.getData(0, 0))
tdSql.query("select percentile(col14, 100) from test")
tdSql.checkData(0, 0, np.percentile(intData, 100))
tdSql.query("select apercentile(col14, 100) from test")
print("apercentile result: %s" % tdSql.getData(0, 0))
tdSql.query("select percentile(col5, 0) from test")
tdSql.query("select percentile(col5, 0) from test")
print("query result: %s" % tdSql.getData(0, 0))
print("array result: %s" % np.percentile(floatData, 0))
tdSql.query("select apercentile(col5, 0) from test")
......@@ -164,12 +177,12 @@ class TDTestCase:
print("query result: %s" % tdSql.getData(0, 0))
print("array result: %s" % np.percentile(floatData, 50))
tdSql.query("select apercentile(col5, 50) from test")
print("apercentile result: %s" % tdSql.getData(0, 0))
print("apercentile result: %s" % tdSql.getData(0, 0))
tdSql.query("select percentile(col5, 100) from test")
print("query result: %s" % tdSql.getData(0, 0))
print("array result: %s" % np.percentile(floatData, 100))
print("array result: %s" % np.percentile(floatData, 100))
tdSql.query("select apercentile(col5, 100) from test")
print("apercentile result: %s" % tdSql.getData(0, 0))
print("apercentile result: %s" % tdSql.getData(0, 0))
tdSql.query("select percentile(col6, 0) from test")
tdSql.checkData(0, 0, np.percentile(floatData, 0))
......@@ -180,17 +193,17 @@ class TDTestCase:
tdSql.query("select apercentile(col6, 50) from test")
print("apercentile result: %s" % tdSql.getData(0, 0))
tdSql.query("select percentile(col6, 100) from test")
tdSql.checkData(0, 0, np.percentile(floatData, 100))
tdSql.query("select apercentile(col6, 100) from test")
print("apercentile result: %s" % tdSql.getData(0, 0))
print("apercentile result: %s" % tdSql.getData(0, 0))
tdSql.execute("create table meters (ts timestamp, voltage int) tags(loc nchar(20))")
tdSql.execute("create table t0 using meters tags('beijing')")
tdSql.execute("create table t1 using meters tags('shanghai')")
for i in range(self.rowNum):
tdSql.execute("insert into t0 values(%d, %d)" % (self.ts + i, i + 1))
tdSql.execute("insert into t1 values(%d, %d)" % (self.ts + i, i + 1))
tdSql.execute("insert into t0 values(%d, %d)" % (self.ts + i, i + 1))
tdSql.execute("insert into t1 values(%d, %d)" % (self.ts + i, i + 1))
tdSql.error("select percentile(voltage, 20) from meters")
tdSql.query("select apercentile(voltage, 20) from meters")
print("apercentile result: %s" % tdSql.getData(0, 0))
......@@ -204,7 +217,7 @@ class TDTestCase:
tdSql.checkData(0, 0, -100.00)
def stop(self):
tdSql.close()
tdLog.success("%s successfully executed" % __file__)
......