refactor(query): add simd support for minmax query.

d7143926 · Haojun Liao · e0e55b2d · d7143926 · d7143926
Showing 2 changed files with 268 additions and 47 deletions

source/libs/function/src/detail/tavgfunction.c source/libs/function/src/detail/tavgfunction.c +0 -33

source/libs/function/src/detail/tminmax.c source/libs/function/src/detail/tminmax.c +268 -14

未找到文件。
--- a/source/libs/function/src/detail/tavgfunction.c
+++ b/source/libs/function/src/detail/tavgfunction.c
@@ -272,39 +272,6 @@ static void i64VectorSumAVX2(const int64_t* plist, int32_t numOfRows, SAvgRes* p
 #endif
 }
-static int32_t handleFloatCols(const SColumnInfoData* pCol, const SInputColumnInfoData* pInput, SAvgRes* pRes) {
-  int32_t numOfElems = 0;
-  float*  plist = (float*)pCol->pData;
-  const int32_t THRESHOLD_SIZE = 8;
-  if (pCol->hasNull || pInput->numOfRows <= THRESHOLD_SIZE) {
-    for (int32_t i = pInput->startRowIndex; i < pInput->numOfRows + pInput->startRowIndex; ++i) {
-      if (colDataIsNull_f(pCol->nullbitmap, i)) {
-        continue;
-      }
-      numOfElems += 1;
-      pRes->count += 1;
-      pRes->sum.dsum += plist[i];
-    }
-  } else { // no null values exist
-    numOfElems = pInput->numOfRows;
-    pRes->count += pInput->numOfRows;
-    // 3. If the CPU supports AVX, let's employ AVX instructions to speedup this loop
-    if (tsAVXEnable && tsSIMDEnable) {
-      floatVectorSumAVX(plist, pInput->numOfRows, pRes);
-    } else {
-      for (int32_t i = pInput->startRowIndex; i < pInput->numOfRows + pInput->startRowIndex; ++i) {
-        pRes->sum.dsum += plist[i];
-      }
-    }
-  }
-  return numOfElems;
-}
 int32_t getAvgInfoSize() { return (int32_t)sizeof(SAvgRes); }
 bool getAvgFuncEnv(SFunctionNode* UNUSED_PARAM(pFunc), SFuncExecEnv* pEnv) {

--- a/source/libs/function/src/detail/tminmax.c
+++ b/source/libs/function/src/detail/tminmax.c
@@ -30,7 +30,7 @@ static int32_t i32VectorCmpAVX2(const int32_t* pData, int32_t numOfRows, bool is
 #if __AVX2__
  __m256i next;
-  __m256i initialVal = _mm256_loadu_si256((__m256i*)p);
+  __m256i initialVal = _mm256_lddqu_si256((__m256i*)p);
  p += width;
  if (!isMinFunc) {  // max function
@@ -40,7 +40,7 @@ static int32_t i32VectorCmpAVX2(const int32_t* pData, int32_t numOfRows, bool is
      p += width;
    }
-    // let sum up the final results
+    // let compare  the final results
    const int32_t* q = (const int32_t*)&initialVal;
    v = TMAX(q[0], q[1]);
    for (int32_t k = 1; k < width; ++k) {
@@ -155,7 +155,7 @@ static int8_t i8VectorCmpAVX2(const int8_t* pData, int32_t numOfRows, bool isMin
 #if __AVX2__
  __m256i next;
-  __m256i initialVal = _mm256_loadu_si256((__m256i*)p);
+  __m256i initialVal = _mm256_lddqu_si256((__m256i*)p);
  p += width;
  if (!isMinFunc) {  // max function
@@ -218,7 +218,7 @@ static int16_t i16VectorCmpAVX2(const int16_t* pData, int32_t numOfRows, bool is
 #if __AVX2__
  __m256i next;
-  __m256i initialVal = _mm256_loadu_si256((__m256i*)p);
+  __m256i initialVal = _mm256_lddqu_si256((__m256i*)p);
  p += width;
  if (!isMinFunc) {  // max function
@@ -271,6 +271,179 @@ static int16_t i16VectorCmpAVX2(const int16_t* pData, int32_t numOfRows, bool is
  return v;
 }
+// Folds rows [start, start + numOfRows) of an int8 column into the running min/max stored in pBuf->v.
+//
+//   pCol       column to scan; pData, nullbitmap and hasNull are read
+//   start      index of the first row to examine
+//   numOfRows  number of rows to examine
+//   pCtx       function context; when pCtx->subsidiaries.num > 0 the tuple of the winning row is saved/updated
+//   pBuf       running result: v is the current extreme, assign tells whether v is valid, tuplePos tracks the row
+//   isMinFunc  true to compute the minimum, false to compute the maximum
+//
+// Returns the element count accumulated by this call (numOfRows on the null-free fast path).
+static int32_t handleInt8Col(SColumnInfoData* pCol, int32_t start, int32_t numOfRows, SqlFunctionCtx* pCtx,
+                             SMinmaxResInfo* pBuf, bool isMinFunc) {
+  int8_t* pData = (int8_t*)pCol->pData;
+  int8_t* val = (int8_t*)&pBuf->v;
+  int32_t numOfElems = 0;
+
+  // Row-by-row path: taken when nulls must be skipped, the window is small, or tuple positions must be tracked.
+  if (pCol->hasNull || numOfRows <= 8 || pCtx->subsidiaries.num > 0) {
+    // advance to the first non-null row of the window
+    int32_t i = start;
+    while (i < (start + numOfRows)) {
+      if (!colDataIsNull_f(pCol->nullbitmap, i)) {
+        break;
+      }
+      i += 1;
+    }
+
+    // seed the running value from that row if nothing has been assigned yet
+    // NOTE(review): the loop below starts at the same row i, so the seeding row is counted twice in numOfElems;
+    // left as-is, confirm callers only test the count against zero.
+    if ((i < (start + numOfRows)) && (!pBuf->assign)) {
+      *val = pData[i];
+      if (pCtx->subsidiaries.num > 0) {
+        pBuf->tuplePos = saveTupleData(pCtx, i, pCtx->pSrcBlock, NULL);
+      }
+      pBuf->assign = true;
+      numOfElems += 1;
+    }
+
+    if (isMinFunc) {  // min
+      for (; i < start + numOfRows; ++i) {
+        if (colDataIsNull_f(pCol->nullbitmap, i)) {
+          continue;
+        }
+        if (*val > pData[i]) {
+          *val = pData[i];
+          if (pCtx->subsidiaries.num > 0) {
+            updateTupleData(pCtx, i, pCtx->pSrcBlock, &pBuf->tuplePos);
+          }
+        }
+        numOfElems += 1;
+      }
+    } else {  // max
+      for (; i < start + numOfRows; ++i) {
+        if (colDataIsNull_f(pCol->nullbitmap, i)) {
+          continue;
+        }
+        // strict comparison: rows equal to the current maximum do not replace it
+        if (*val < pData[i]) {
+          *val = pData[i];
+          if (pCtx->subsidiaries.num > 0) {
+            updateTupleData(pCtx, i, pCtx->pSrcBlock, &pBuf->tuplePos);
+          }
+        }
+        numOfElems += 1;
+      }
+    }
+  } else {  // no null values and no tuple tracking
+    if (tsAVX2Enable && tsSIMDEnable) {
+      // Scan only the requested window, then merge with the value kept from earlier calls instead of
+      // overwriting it, and mark the buffer as assigned so later calls keep merging.
+      int8_t blockVal = i8VectorCmpAVX2(pData + start, numOfRows, isMinFunc);
+      if (!pBuf->assign || (isMinFunc ? (blockVal < *val) : (blockVal > *val))) {
+        *val = blockVal;
+      }
+      pBuf->assign = true;
+    } else {
+      if (!pBuf->assign) {
+        // seed from the first row of the window, not from row 0 of the column
+        *val = pData[start];
+        pBuf->assign = true;
+      }
+      if (isMinFunc) {  // min
+        for (int32_t i = start; i < start + numOfRows; ++i) {
+          if (*val > pData[i]) {
+            *val = pData[i];
+          }
+        }
+      } else {  // max
+        for (int32_t i = start; i < start + numOfRows; ++i) {
+          if (*val < pData[i]) {
+            *val = pData[i];
+          }
+        }
+      }
+    }
+    numOfElems = numOfRows;
+  }
+  return numOfElems;
+}
+// Folds rows [start, start + numOfRows) of an int16 column into the running min/max stored in pBuf->v.
+//
+//   pCol       column to scan; pData, nullbitmap and hasNull are read
+//   start      index of the first row to examine
+//   numOfRows  number of rows to examine
+//   pCtx       function context; when pCtx->subsidiaries.num > 0 the tuple of the winning row is saved/updated
+//   pBuf       running result: v is the current extreme, assign tells whether v is valid, tuplePos tracks the row
+//   isMinFunc  true to compute the minimum, false to compute the maximum
+//
+// Returns the element count accumulated by this call (numOfRows on the null-free fast path).
+static int32_t handleInt16Col(SColumnInfoData* pCol, int32_t start, int32_t numOfRows, SqlFunctionCtx* pCtx,
+                              SMinmaxResInfo* pBuf, bool isMinFunc) {
+  int16_t* pData = (int16_t*)pCol->pData;
+  int16_t* val = (int16_t*)&pBuf->v;
+  int32_t  numOfElems = 0;
+
+  // Row-by-row path: taken when nulls must be skipped, the window is small, or tuple positions must be tracked.
+  if (pCol->hasNull || numOfRows <= 8 || pCtx->subsidiaries.num > 0) {
+    // advance to the first non-null row of the window
+    int32_t i = start;
+    while (i < (start + numOfRows)) {
+      if (!colDataIsNull_f(pCol->nullbitmap, i)) {
+        break;
+      }
+      i += 1;
+    }
+
+    // seed the running value from that row if nothing has been assigned yet
+    // NOTE(review): the loop below starts at the same row i, so the seeding row is counted twice in numOfElems;
+    // left as-is, confirm callers only test the count against zero.
+    if ((i < (start + numOfRows)) && (!pBuf->assign)) {
+      *val = pData[i];
+      if (pCtx->subsidiaries.num > 0) {
+        pBuf->tuplePos = saveTupleData(pCtx, i, pCtx->pSrcBlock, NULL);
+      }
+      pBuf->assign = true;
+      numOfElems += 1;
+    }
+
+    if (isMinFunc) {  // min
+      for (; i < start + numOfRows; ++i) {
+        if (colDataIsNull_f(pCol->nullbitmap, i)) {
+          continue;
+        }
+        if (*val > pData[i]) {
+          *val = pData[i];
+          if (pCtx->subsidiaries.num > 0) {
+            updateTupleData(pCtx, i, pCtx->pSrcBlock, &pBuf->tuplePos);
+          }
+        }
+        numOfElems += 1;
+      }
+    } else {  // max
+      for (; i < start + numOfRows; ++i) {
+        if (colDataIsNull_f(pCol->nullbitmap, i)) {
+          continue;
+        }
+        // strict comparison: rows equal to the current maximum do not replace it
+        if (*val < pData[i]) {
+          *val = pData[i];
+          if (pCtx->subsidiaries.num > 0) {
+            updateTupleData(pCtx, i, pCtx->pSrcBlock, &pBuf->tuplePos);
+          }
+        }
+        numOfElems += 1;
+      }
+    }
+  } else {  // no null values and no tuple tracking
+    if (tsAVX2Enable && tsSIMDEnable) {
+      // Scan only the requested window, then merge with the value kept from earlier calls instead of
+      // overwriting it, and mark the buffer as assigned so later calls keep merging.
+      int16_t blockVal = i16VectorCmpAVX2(pData + start, numOfRows, isMinFunc);
+      if (!pBuf->assign || (isMinFunc ? (blockVal < *val) : (blockVal > *val))) {
+        *val = blockVal;
+      }
+      pBuf->assign = true;
+    } else {
+      if (!pBuf->assign) {
+        // seed from the first row of the window, not from row 0 of the column
+        *val = pData[start];
+        pBuf->assign = true;
+      }
+      if (isMinFunc) {  // min
+        for (int32_t i = start; i < start + numOfRows; ++i) {
+          if (*val > pData[i]) {
+            *val = pData[i];
+          }
+        }
+      } else {  // max
+        for (int32_t i = start; i < start + numOfRows; ++i) {
+          if (*val < pData[i]) {
+            *val = pData[i];
+          }
+        }
+      }
+    }
+    numOfElems = numOfRows;
+  }
+  return numOfElems;
+}
 static int32_t handleInt32Col(SColumnInfoData* pCol, int32_t start, int32_t numOfRows, SqlFunctionCtx* pCtx,
                              SMinmaxResInfo* pBuf, bool isMinFunc) {
@@ -359,6 +532,87 @@ static int32_t handleInt32Col(SColumnInfoData* pCol, int32_t start, int32_t numO
  return numOfElems;
 }
+// Folds rows [start, start + numOfRows) of an int64 column into the running min/max stored in pBuf->v.
+//
+//   pCol       column to scan; pData, nullbitmap and hasNull are read
+//   start      index of the first row to examine
+//   numOfRows  number of rows to examine
+//   pCtx       function context; when pCtx->subsidiaries.num > 0 the tuple of the winning row is saved/updated
+//   pBuf       running result: v is the current extreme, assign tells whether v is valid, tuplePos tracks the row
+//   isMinFunc  true to compute the minimum, false to compute the maximum
+//
+// Returns the element count accumulated by this call (numOfRows on the null-free path).
+static int32_t handleInt64Col(SColumnInfoData* pCol, int32_t start, int32_t numOfRows, SqlFunctionCtx* pCtx,
+                              SMinmaxResInfo* pBuf, bool isMinFunc) {
+  // The column holds 64-bit values: both the data and the result must be accessed as int64_t, otherwise rows are
+  // read with the wrong stride and the result is truncated to 32 bits.
+  int64_t* pData = (int64_t*)pCol->pData;
+  int64_t* val = (int64_t*)&pBuf->v;
+  int32_t  numOfElems = 0;
+
+  // Row-by-row path: taken when nulls must be skipped or tuple positions must be tracked.
+  if (pCol->hasNull || pCtx->subsidiaries.num > 0) {
+    // advance to the first non-null row of the window
+    int32_t i = start;
+    while (i < (start + numOfRows)) {
+      if (!colDataIsNull_f(pCol->nullbitmap, i)) {
+        break;
+      }
+      i += 1;
+    }
+
+    // seed the running value from that row if nothing has been assigned yet
+    // NOTE(review): the loop below starts at the same row i, so the seeding row is counted twice in numOfElems;
+    // left as-is, confirm callers only test the count against zero.
+    if ((i < (start + numOfRows)) && (!pBuf->assign)) {
+      *val = pData[i];
+      if (pCtx->subsidiaries.num > 0) {
+        pBuf->tuplePos = saveTupleData(pCtx, i, pCtx->pSrcBlock, NULL);
+      }
+      pBuf->assign = true;
+      numOfElems += 1;
+    }
+
+    if (isMinFunc) {  // min
+      for (; i < start + numOfRows; ++i) {
+        if (colDataIsNull_f(pCol->nullbitmap, i)) {
+          continue;
+        }
+        if (*val > pData[i]) {
+          *val = pData[i];
+          if (pCtx->subsidiaries.num > 0) {
+            updateTupleData(pCtx, i, pCtx->pSrcBlock, &pBuf->tuplePos);
+          }
+        }
+        numOfElems += 1;
+      }
+    } else {  // max
+      for (; i < start + numOfRows; ++i) {
+        if (colDataIsNull_f(pCol->nullbitmap, i)) {
+          continue;
+        }
+        // strict comparison: rows equal to the current maximum do not replace it
+        if (*val < pData[i]) {
+          *val = pData[i];
+          if (pCtx->subsidiaries.num > 0) {
+            updateTupleData(pCtx, i, pCtx->pSrcBlock, &pBuf->tuplePos);
+          }
+        }
+        numOfElems += 1;
+      }
+    }
+  } else {  // no null values and no tuple tracking: plain scalar scan (no SIMD kernel is used for int64 here)
+    // seed from the first row of the window (not row 0 of the column), and only if the window is non-empty
+    if (!pBuf->assign && numOfRows > 0) {
+      *val = pData[start];
+      pBuf->assign = true;
+    }
+    if (isMinFunc) {  // min
+      for (int32_t i = start; i < start + numOfRows; ++i) {
+        if (*val > pData[i]) {
+          *val = pData[i];
+        }
+      }
+    } else {  // max
+      for (int32_t i = start; i < start + numOfRows; ++i) {
+        if (*val < pData[i]) {
+          *val = pData[i];
+        }
+      }
+    }
+    numOfElems = numOfRows;
+  }
+  return numOfElems;
+}
 static int32_t handleFloatCol(SColumnInfoData* pCol, int32_t start, int32_t numOfRows, SqlFunctionCtx* pCtx,
                              SMinmaxResInfo* pBuf, bool isMinFunc) {
  float* pData = (float*)pCol->pData;
@@ -445,13 +699,13 @@ static int32_t handleFloatCol(SColumnInfoData* pCol, int32_t start, int32_t numO
  return numOfElems;
 }
-static int32_t handleInt8Col(SColumnInfoData* pCol, int32_t start, int32_t numOfRows, SqlFunctionCtx* pCtx,
+static int32_t handleDoubleCol(SColumnInfoData* pCol, int32_t start, int32_t numOfRows, SqlFunctionCtx* pCtx,
-                             SMinmaxResInfo* pBuf, bool isMinFunc) {
+                              SMinmaxResInfo* pBuf, bool isMinFunc) {
-  int8_t* pData = (int8_t*)pCol->pData;
+  float* pData = (float*)pCol->pData;
-  int8_t* val = (int8_t*)&pBuf->v;
+  double* val = (double*)&pBuf->v;
  int32_t numOfElems = 0;
-  if (pCol->hasNull || numOfRows <= 8 || pCtx->subsidiaries.num > 0) {
+  if (pCol->hasNull || numOfRows < 8 || pCtx->subsidiaries.num > 0) {
    int32_t i = start;
    while (i < (start + numOfRows)) {
      if (!colDataIsNull_f(pCol->nullbitmap, i)) {
@@ -483,12 +737,12 @@ static int32_t handleInt8Col(SColumnInfoData* pCol, int32_t start, int32_t numOf
        }
        numOfElems += 1;
      }
    } else {  // max function
      for (; i < start + numOfRows; ++i) {
        if (colDataIsNull_f(pCol->nullbitmap, i)) {
          continue;
        }
        // ignore the equivalent data value
        // NOTE: An faster version to avoid one additional comparison with FPU.
        if (*val < pData[i]) {
@@ -499,12 +753,11 @@ static int32_t handleInt8Col(SColumnInfoData* pCol, int32_t start, int32_t numOf
        }
        numOfElems += 1;
      }
    }
  } else {  // not has null value
-    // AVX2 version to speedup the loop
+    // AVX version to speedup the loop
-    if (tsAVX2Enable && tsSIMDEnable) {
+    if (tsAVXEnable && tsSIMDEnable) {
-      *val = i8VectorCmpAVX2(pData, numOfRows, isMinFunc);
+      *val = (double) floatVectorCmpAVX(pData, numOfRows, isMinFunc);
    } else {
      if (!pBuf->assign) {
        *val = pData[0];
@@ -660,6 +913,7 @@ int32_t doMinMaxHelper(SqlFunctionCtx* pCtx, int32_t isMinFunc) {
    if (type == TSDB_DATA_TYPE_TINYINT || type == TSDB_DATA_TYPE_BOOL) {
      numOfElems = handleInt8Col(pCol, start, numOfRows, pCtx, pBuf, isMinFunc);
    } else if (type == TSDB_DATA_TYPE_SMALLINT) {
+      numOfElems = handleInt16Col(pCol, start, numOfRows, pCtx, pBuf, isMinFunc);
      int16_t* pData = (int16_t*)pCol->pData;
      int16_t* val = (int16_t*)&pBuf->v;