diff --git a/include/libs/function/function.h b/include/libs/function/function.h
index 6f2a67546658228615ebe20eade9e626e9a13ed7..240772bfc27101adac022f3cb2a90bfd0fc55b7b 100644
--- a/include/libs/function/function.h
+++ b/include/libs/function/function.h
@@ -115,7 +115,7 @@ typedef struct SInputColumnInfoData {
   int32_t           startRowIndex;    // handle started row index
   int32_t           numOfRows;        // the number of rows needs to be handled
   int32_t           numOfInputCols;   // PTS is not included
-  bool              colDataAggIsSet;  // if agg is set or not
+  bool              colDataSMAIsSet;  // if agg is set or not
   SColumnInfoData  *pPTS;             // primary timestamp column
   SColumnInfoData **pData;
   SColumnDataAgg  **pColumnDataAgg;
diff --git a/source/libs/executor/src/executorimpl.c b/source/libs/executor/src/executorimpl.c
index f16462268755d5e4936987f83d58d697bcdb5d92..5031d75231557270abccf745c3686c7d18df3502 100644
--- a/source/libs/executor/src/executorimpl.c
+++ b/source/libs/executor/src/executorimpl.c
@@ -349,13 +349,13 @@ typedef struct {
 } SFunctionCtxStatus;
 
 static void functionCtxSave(SqlFunctionCtx* pCtx, SFunctionCtxStatus* pStatus) {
-  pStatus->hasAgg = pCtx->input.colDataAggIsSet;
+  pStatus->hasAgg = pCtx->input.colDataSMAIsSet;
   pStatus->numOfRows = pCtx->input.numOfRows;
   pStatus->startOffset = pCtx->input.startRowIndex;
 }
 
 static void functionCtxRestore(SqlFunctionCtx* pCtx, SFunctionCtxStatus* pStatus) {
-  pCtx->input.colDataAggIsSet = pStatus->hasAgg;
+  pCtx->input.colDataSMAIsSet = pStatus->hasAgg;
   pCtx->input.numOfRows = pStatus->numOfRows;
   pCtx->input.startRowIndex = pStatus->startOffset;
 }
@@ -372,8 +372,8 @@ void doApplyFunctions(SExecTaskInfo* taskInfo, SqlFunctionCtx* pCtx, SColumnInfo
 
     // not a whole block involved in query processing, statistics data can not be used
     // NOTE: the original value of isSet have been changed here
-    if (pCtx[k].input.colDataAggIsSet && forwardStep < numOfTotal) {
-      pCtx[k].input.colDataAggIsSet = false;
+    if (pCtx[k].input.colDataSMAIsSet && forwardStep < numOfTotal) {
+      pCtx[k].input.colDataSMAIsSet = false;
     }
 
     if (fmIsWindowPseudoColumnFunc(pCtx[k].functionId)) {
@@ -486,7 +486,7 @@ static int32_t doSetInputDataBlock(SExprSupp* pExprSup, SSDataBlock* pBlock, int
 
     SInputColumnInfoData* pInput = &pCtx[i].input;
     pInput->uid = pBlock->info.uid;
-    pInput->colDataAggIsSet = false;
+    pInput->colDataSMAIsSet = false;
 
     SExprInfo* pOneExpr = &pExprSup->pExprInfo[i];
     for (int32_t j = 0; j < pOneExpr->base.numOfParams; ++j) {
@@ -798,7 +798,7 @@ void setBlockSMAInfo(SqlFunctionCtx* pCtx, SExprInfo* pExprInfo, SSDataBlock* pB
   pInput->totalRows = numOfRows;
 
   if (pBlock->pBlockAgg != NULL) {
-    pInput->colDataAggIsSet = true;
+    pInput->colDataSMAIsSet = true;
 
     for (int32_t j = 0; j < pExprInfo->base.numOfParams; ++j) {
       SFunctParam* pFuncParam = &pExprInfo->base.pParam[j];
@@ -807,7 +807,7 @@ void setBlockSMAInfo(SqlFunctionCtx* pCtx, SExprInfo* pExprInfo, SSDataBlock* pB
         int32_t slotId = pFuncParam->pCol->slotId;
         pInput->pColumnDataAgg[j] = pBlock->pBlockAgg[slotId];
         if (pInput->pColumnDataAgg[j] == NULL) {
-          pInput->colDataAggIsSet = false;
+          pInput->colDataSMAIsSet = false;
         }
 
         // Here we set the column info data since the data type for each column data is required, but
@@ -818,7 +818,7 @@ void setBlockSMAInfo(SqlFunctionCtx* pCtx, SExprInfo* pExprInfo, SSDataBlock* pB
       }
     }
   } else {
-    pInput->colDataAggIsSet = false;
+    pInput->colDataSMAIsSet = false;
   }
 }
 
diff --git a/source/libs/function/src/builtinsimpl.c b/source/libs/function/src/builtinsimpl.c
index 26f9c3ad0b33d90f79dfe7d50fdf9783ccdc4991..bf79cb5191e3ca3d9f2017bbf7d9a082823cf7fb 100644
--- a/source/libs/function/src/builtinsimpl.c
+++ b/source/libs/function/src/builtinsimpl.c
@@ -498,13 +498,13 @@ static int32_t getNumOfElems(SqlFunctionCtx* pCtx) {
   int32_t numOfElem = 0;
 
   /*
-   * 1. column data missing (schema modified) causes pInputCol->hasNull == true. pInput->colDataAggIsSet == true;
-   * 2. for general non-primary key columns, pInputCol->hasNull may be true or false, pInput->colDataAggIsSet == true;
-   * 3. for primary key column, pInputCol->hasNull always be false, pInput->colDataAggIsSet == false;
+   * 1. column data missing (schema modified) causes pInputCol->hasNull == true. pInput->colDataSMAIsSet == true;
+   * 2. for general non-primary key columns, pInputCol->hasNull may be true or false, pInput->colDataSMAIsSet == true;
+   * 3. for primary key column, pInputCol->hasNull always be false, pInput->colDataSMAIsSet == false;
    */
   SInputColumnInfoData* pInput = &pCtx->input;
   SColumnInfoData*      pInputCol = pInput->pData[0];
-  if (pInput->colDataAggIsSet && pInput->totalRows == pInput->numOfRows) {
+  if (pInput->colDataSMAIsSet && pInput->totalRows == pInput->numOfRows) {
     numOfElem = pInput->numOfRows - pInput->pColumnDataAgg[0]->numOfNull;
     ASSERT(numOfElem >= 0);
   } else {
@@ -593,7 +593,7 @@ int32_t sumFunction(SqlFunctionCtx* pCtx) {
     goto _sum_over;
   }
 
-  if (pInput->colDataAggIsSet) {
+  if (pInput->colDataSMAIsSet) {
     numOfElem = pInput->numOfRows - pAgg->numOfNull;
     ASSERT(numOfElem >= 0);
 
@@ -658,7 +658,7 @@ int32_t sumInvertFunction(SqlFunctionCtx* pCtx) {
 
   SSumRes* pSumRes = GET_ROWCELL_INTERBUF(GET_RES_INFO(pCtx));
 
-  if (pInput->colDataAggIsSet) {
+  if (pInput->colDataSMAIsSet) {
     numOfElem = pInput->numOfRows - pAgg->numOfNull;
     ASSERT(numOfElem >= 0);
 
@@ -770,7 +770,7 @@ bool getSumFuncEnv(SFunctionNode* UNUSED_PARAM(pFunc), SFuncExecEnv* pEnv) {
 //    goto _avg_over;
 //  }
 //
-//  if (pInput->colDataAggIsSet) {
+//  if (pInput->colDataSMAIsSet) {
 //    numOfElem = numOfRows - pAgg->numOfNull;
 //    ASSERT(numOfElem >= 0);
 //
@@ -1161,7 +1161,7 @@ bool getMinmaxFuncEnv(SFunctionNode* UNUSED_PARAM(pFunc), SFuncExecEnv* pEnv) {
 //  }
 //
 //  // data in current data block are qualified to the query
-//  if (pInput->colDataAggIsSet) {
+//  if (pInput->colDataSMAIsSet) {
 //    numOfElems = pInput->numOfRows - pAgg->numOfNull;
 //    ASSERT(pInput->numOfRows == pInput->totalRows && numOfElems >= 0);
 //    if (numOfElems == 0) {
@@ -2471,7 +2471,7 @@ int32_t percentileFunction(SqlFunctionCtx* pCtx) {
 
   // the first stage, only acquire the min/max value
   if (pInfo->stage == 0) {
-    if (pCtx->input.colDataAggIsSet) {
+    if (pCtx->input.colDataSMAIsSet) {
       double tmin = 0.0, tmax = 0.0;
       if (IS_SIGNED_NUMERIC_TYPE(type)) {
         tmin = (double)GET_INT64_VAL(&pAgg->min);
@@ -2933,14 +2933,14 @@ int32_t firstFunction(SqlFunctionCtx* pCtx) {
   pInfo->bytes = pInputCol->info.bytes;
 
   // All null data column, return directly.
-  if (pInput->colDataAggIsSet && (pInput->pColumnDataAgg[0]->numOfNull == pInput->totalRows)) {
+  if (pInput->colDataSMAIsSet && (pInput->pColumnDataAgg[0]->numOfNull == pInput->totalRows)) {
     ASSERT(pInputCol->hasNull == true);
     // save selectivity value for column consisted of all null values
     firstlastSaveTupleData(pCtx->pSrcBlock, pInput->startRowIndex, pCtx, pInfo);
     return 0;
   }
 
-  SColumnDataAgg* pColAgg = (pInput->colDataAggIsSet) ? pInput->pColumnDataAgg[0] : NULL;
+  SColumnDataAgg* pColAgg = (pInput->colDataSMAIsSet) ? pInput->pColumnDataAgg[0] : NULL;
 
   TSKEY startKey = getRowPTs(pInput->pPTS, 0);
   TSKEY endKey = getRowPTs(pInput->pPTS, pInput->totalRows - 1);
@@ -3037,14 +3037,14 @@ int32_t lastFunction(SqlFunctionCtx* pCtx) {
   pInfo->bytes = bytes;
 
   // All null data column, return directly.
-  if (pInput->colDataAggIsSet && (pInput->pColumnDataAgg[0]->numOfNull == pInput->totalRows)) {
+  if (pInput->colDataSMAIsSet && (pInput->pColumnDataAgg[0]->numOfNull == pInput->totalRows)) {
     ASSERT(pInputCol->hasNull == true);
     // save selectivity value for column consisted of all null values
     firstlastSaveTupleData(pCtx->pSrcBlock, pInput->startRowIndex, pCtx, pInfo);
     return 0;
   }
 
-  SColumnDataAgg* pColAgg = (pInput->colDataAggIsSet) ? pInput->pColumnDataAgg[0] : NULL;
+  SColumnDataAgg* pColAgg = (pInput->colDataSMAIsSet) ? pInput->pColumnDataAgg[0] : NULL;
 
   TSKEY startKey = getRowPTs(pInput->pPTS, 0);
   TSKEY endKey = getRowPTs(pInput->pPTS, pInput->totalRows - 1);
@@ -3988,7 +3988,7 @@ int32_t spreadFunction(SqlFunctionCtx* pCtx) {
 
   SSpreadInfo* pInfo = GET_ROWCELL_INTERBUF(GET_RES_INFO(pCtx));
 
-  if (pInput->colDataAggIsSet) {
+  if (pInput->colDataSMAIsSet) {
     numOfElems = pInput->numOfRows - pAgg->numOfNull;
     if (numOfElems == 0) {
       goto _spread_over;
@@ -4163,7 +4163,7 @@ int32_t elapsedFunction(SqlFunctionCtx* pCtx) {
     goto _elapsed_over;
   }
 
-  if (pInput->colDataAggIsSet) {
+  if (pInput->colDataSMAIsSet) {
     if (pInfo->min == TSKEY_MAX) {
       pInfo->min = GET_INT64_VAL(&pAgg->min);
       pInfo->max = GET_INT64_VAL(&pAgg->max);
diff --git a/source/libs/function/src/detail/tavgfunction.c b/source/libs/function/src/detail/tavgfunction.c
index 01e0a499eb6e7c2babada5c4845b4739828f0b74..9c3b9cf573c07344e6161e9e57d645b62cf0d6f9 100644
--- a/source/libs/function/src/detail/tavgfunction.c
+++ b/source/libs/function/src/detail/tavgfunction.c
@@ -48,15 +48,14 @@ typedef struct SAvgRes {
   int16_t type;  // store the original input type, used in merge function
 } SAvgRes;
 
-static void floatVectorSumAVX(const SInputColumnInfoData* pInput, const float* plist, SAvgRes* pRes) {
+static void floatVectorSumAVX(const float* plist, int32_t numOfRows, SAvgRes* pRes) {
 #if __AVX__
   // find the start position that are aligned to 32bytes address in memory
-  int32_t startIndex = 0;  //((uint64_t)plist) & ((1<<8u)-1);
   int32_t bitWidth = 8;
+  int32_t remainder = numOfRows % bitWidth;
+  int32_t rounds = numOfRows / bitWidth;
 
-  int32_t      remain = (pInput->numOfRows - startIndex) % bitWidth;
-  int32_t      rounds = (pInput->numOfRows - startIndex) / bitWidth;
-  const float* p = &plist[startIndex];
+  const float* p = plist;
 
   __m256 val;
   __m256 sum = _mm256_setzero_ps();
@@ -71,18 +70,126 @@ static void floatVectorSumAVX(const SInputColumnInfoData* pInput, const float* p
   const float* q = (const float*)&sum;
   pRes->sum.dsum += q[0] + q[1] + q[2] + q[3] + q[4] + q[5] + q[6] + q[7];
 
-  // calculate the front and the reminder items in array list
-  for (int32_t j = 0; j < startIndex; ++j) {
-    pRes->sum.dsum += plist[j];
+  int32_t startIndex = rounds * bitWidth;
+  for (int32_t j = 0; j < remainder; ++j) {
+    pRes->sum.dsum += plist[j + startIndex];
   }
+#endif
+}
+
+static void doubleVectorSumAVX(const double* plist, int32_t numOfRows, SAvgRes* pRes) {
+#if __AVX__
+  // find the start position that are aligned to 32bytes address in memory
+  int32_t bitWidth = 4;
+  int32_t remainder = numOfRows % bitWidth;
+  int32_t rounds = numOfRows / bitWidth;
+
+  const double* p = plist;
+
+  __m256d val;
+  __m256d sum = _mm256_setzero_pd();
 
-  startIndex += rounds * bitWidth;
-  for (int32_t j = 0; j < remain; ++j) {
+  for (int32_t i = 0; i < rounds; ++i) {
+    val = _mm256_loadu_pd(p);
+    sum = _mm256_add_pd(sum, val);
+    p += bitWidth;
+  }
+
+  // let sum up the final results
+  const double* q = (const double*)&sum;
+  pRes->sum.dsum += q[0] + q[1] + q[2] + q[3];
+
+  int32_t startIndex = rounds * bitWidth;
+  for (int32_t j = 0; j < remainder; ++j) {
     pRes->sum.dsum += plist[j + startIndex];
   }
 #endif
 }
 
+static void i8VectorSumAVX2(const int8_t* plist, int32_t numOfRows, SAvgRes* pRes) {
+#if __AVX2__
+  // find the start position that are aligned to 32bytes address in memory
+  int32_t bitWidth = 16;
+  int32_t remainder = numOfRows % bitWidth;
+  int32_t rounds = numOfRows / bitWidth;
+
+  const int8_t* p = plist;
+
+  __m256i sum = _mm256_setzero_si256();
+
+  for (int32_t i = 0; i < rounds; ++i) {
+    __m256i val = _mm256_lddqu_si256((__m256i*)p);
+//    __m256i extVal = _mm256_cvtepi8_epi64(val);
+    sum = _mm256_add_epi8(sum, val);
+    p += bitWidth;
+  }
+
+  // let sum up the final results
+  const int8_t* q = (const int8_t*)&sum;
+  pRes->sum.isum += q[0] + q[1] + q[2] + q[3];
+
+  int32_t startIndex = rounds * bitWidth;
+  for (int32_t j = 0; j < remainder; ++j) {
+    pRes->sum.isum += plist[j + startIndex];
+  }
+#endif
+}
+
+static void i32VectorSumAVX2(const int32_t* plist, int32_t numOfRows, SAvgRes* pRes) {
+#if __AVX2__
+  // find the start position that are aligned to 32bytes address in memory
+  int32_t bitWidth = 8;
+  int32_t remainder = numOfRows % bitWidth;
+  int32_t rounds = numOfRows / bitWidth;
+
+  const int32_t* p = plist;
+
+  __m256i sum = _mm256_setzero_si256();
+  for (int32_t i = 0; i < rounds; ++i) {
+    __m256i val = _mm256_lddqu_si256((__m256i*)p);
+    sum = _mm256_add_epi32(sum, val);
+    p += bitWidth;
+  }
+
+  // let sum up the final results
+  const int64_t* q = (const int64_t*)&sum;
+  pRes->sum.isum += q[0] + q[1] + q[2] + q[3];
+
+  int32_t startIndex = rounds * bitWidth;
+  for (int32_t j = 0; j < remainder; ++j) {
+    pRes->sum.isum += plist[j + startIndex];
+  }
+#endif
+}
+
+static void i64VectorSumAVX2(const int64_t* plist, int32_t numOfRows, SAvgRes* pRes) {
+#if __AVX2__
+  // find the start position that are aligned to 32bytes address in memory
+  int32_t bitWidth = 4;
+  int32_t remainder = numOfRows % bitWidth;
+  int32_t rounds = numOfRows / bitWidth;
+
+  const int64_t* p = plist;
+
+  __m256i sum = _mm256_setzero_si256();
+
+  for (int32_t i = 0; i < rounds; ++i) {
+    __m256i val = _mm256_lddqu_si256((__m256i*)p);
+    sum = _mm256_add_epi64(sum, val);
+    p += bitWidth;
+  }
+
+  // let sum up the final results
+  const int64_t* q = (const int64_t*)&sum;
+  pRes->sum.isum += q[0] + q[1] + q[2] + q[3];
+
+  int32_t startIndex = rounds * bitWidth;
+  for (int32_t j = 0; j < remainder; ++j) {
+    pRes->sum.isum += plist[j + startIndex];
+  }
+#endif
+}
+
 static int32_t handleFloatCols(const SColumnInfoData* pCol, const SInputColumnInfoData* pInput, SAvgRes* pRes) {
   int32_t numOfElems = 0;
   float*  plist = (float*)pCol->pData;
@@ -105,7 +212,7 @@ static int32_t handleFloatCols(const SColumnInfoData* pCol, const SInputColumnIn
 
     // 3. If the CPU supports AVX, let's employ AVX instructions to speedup this loop
     if (tsAVXEnable && tsSIMDEnable) {
-      floatVectorSumAVX(pInput, plist, pRes);
+      floatVectorSumAVX(plist, pInput->numOfRows, pRes);
     } else {
       for (int32_t i = pInput->startRowIndex; i < pInput->numOfRows + pInput->startRowIndex; ++i) {
         pRes->sum.dsum += plist[i];
@@ -133,8 +240,25 @@ bool avgFunctionSetup(SqlFunctionCtx* pCtx, SResultRowEntryInfo* pResultInfo) {
   return true;
 }
 
+static int32_t calculateAvgBySMAInfo(SAvgRes* pRes, int32_t numOfRows, int32_t type, const SColumnDataAgg* pAgg) {
+  int32_t numOfElem = numOfRows - pAgg->numOfNull;
+  ASSERT(numOfElem >= 0);
+
+  pRes->count += numOfElem;
+  if (IS_SIGNED_NUMERIC_TYPE(type)) {
+    pRes->sum.isum += pAgg->sum;
+  } else if (IS_UNSIGNED_NUMERIC_TYPE(type)) {
+    pRes->sum.usum += pAgg->sum;
+  } else if (IS_FLOAT_TYPE(type)) {
+    pRes->sum.dsum += GET_DOUBLE_VAL((const char*)&(pAgg->sum));
+  }
+
+  return numOfElem;
+}
+
 int32_t avgFunction(SqlFunctionCtx* pCtx) {
-  int32_t numOfElem = 0;
+  int32_t       numOfElem = 0;
+  const int32_t THRESHOLD_SIZE = 8;
 
   SInputColumnInfoData* pInput = &pCtx->input;
   SColumnDataAgg*       pAgg = pInput->pColumnDataAgg[0];
@@ -154,19 +278,149 @@ int32_t avgFunction(SqlFunctionCtx* pCtx) {
     goto _avg_over;
   }
 
-  if (pInput->colDataAggIsSet) {
-    numOfElem = numOfRows - pAgg->numOfNull;
-    ASSERT(numOfElem >= 0);
-
-    pAvgRes->count += numOfElem;
-    if (IS_SIGNED_NUMERIC_TYPE(type)) {
-      pAvgRes->sum.isum += pAgg->sum;
-    } else if (IS_UNSIGNED_NUMERIC_TYPE(type)) {
-      pAvgRes->sum.usum += pAgg->sum;
-    } else if (IS_FLOAT_TYPE(type)) {
-      pAvgRes->sum.dsum += GET_DOUBLE_VAL((const char*)&(pAgg->sum));
+  if (pInput->colDataSMAIsSet) {  // try to use SMA if available
+    numOfElem = calculateAvgBySMAInfo(pAvgRes, numOfRows, type, pAgg);
+  } else if (!pCol->hasNull) {  // try to employ the simd instructions to speed up the loop
+    numOfElem = pInput->numOfRows;
+    pAvgRes->count += pInput->numOfRows;
+
+    bool simdAvaiable = tsAVXEnable && tsSIMDEnable && (numOfRows > THRESHOLD_SIZE);
+
+    switch(type) {
+      case TSDB_DATA_TYPE_TINYINT: {
+        const int8_t* plist = (const int8_t*) pCol->pData;
+
+        // 1. If the CPU supports AVX, let's employ AVX instructions to speedup this loop
+        if (simdAvaiable) {
+          i8VectorSumAVX2(plist, numOfRows, pAvgRes);
+        } else {
+          for (int32_t i = pInput->startRowIndex; i < pInput->numOfRows + pInput->startRowIndex; ++i) {
+            pAvgRes->sum.isum += plist[i];
+          }
+        }
+        break;
+      }
+      case TSDB_DATA_TYPE_SMALLINT: {
+        const double* plist = (const double*)pCol->pData;
+
+        // 1. If the CPU supports AVX, let's employ AVX instructions to speedup this loop
+        if (simdAvaiable) {
+          doubleVectorSumAVX(plist, numOfRows, pAvgRes);
+        } else {
+          for (int32_t i = pInput->startRowIndex; i < pInput->numOfRows + pInput->startRowIndex; ++i) {
+            pAvgRes->sum.isum += plist[i];
+          }
+        }
+        break;
+      }
+      case TSDB_DATA_TYPE_INT: {
+        const int32_t* plist = (const int32_t*) pCol->pData;
+
+        // 1. If the CPU supports AVX, let's employ AVX instructions to speedup this loop
+        if (simdAvaiable) {
+          i32VectorSumAVX2(plist, numOfRows, pAvgRes);
+        } else {
+          for (int32_t i = pInput->startRowIndex; i < pInput->numOfRows + pInput->startRowIndex; ++i) {
+            pAvgRes->sum.isum += plist[i];
+          }
+        }
+        break;
+      }
+      case TSDB_DATA_TYPE_BIGINT: {
+        const int64_t* plist = (const int64_t*) pCol->pData;
+
+        // 1. If the CPU supports AVX, let's employ AVX instructions to speedup this loop
+        if (simdAvaiable) {
+          i64VectorSumAVX2(plist, numOfRows, pAvgRes);
+        } else {
+          for (int32_t i = pInput->startRowIndex; i < pInput->numOfRows + pInput->startRowIndex; ++i) {
+            pAvgRes->sum.isum += plist[i];
+          }
+        }
+        break;
+      }
+      case TSDB_DATA_TYPE_FLOAT: {
+        const float* plist = (const float*) pCol->pData;
+
+        // 1. If the CPU supports AVX, let's employ AVX instructions to speedup this loop
+        if (simdAvaiable) {
+          floatVectorSumAVX(plist, numOfRows, pAvgRes);
+        } else {
+          for (int32_t i = pInput->startRowIndex; i < pInput->numOfRows + pInput->startRowIndex; ++i) {
+            pAvgRes->sum.dsum += plist[i];
+          }
+        }
+        break;
+      }
+      case TSDB_DATA_TYPE_DOUBLE: {
+        const double* plist = (const double*) pCol->pData;
+
+        // 1. If the CPU supports AVX, let's employ AVX instructions to speedup this loop
+        if (simdAvaiable) {
+          doubleVectorSumAVX(plist, numOfRows, pAvgRes);
+        } else {
+          for (int32_t i = pInput->startRowIndex; i < pInput->numOfRows + pInput->startRowIndex; ++i) {
+            pAvgRes->sum.dsum += plist[i];
+          }
+        }
+        break;
+      }
+      case TSDB_DATA_TYPE_UTINYINT: {
+        const double* plist = (const double*) pCol->pData;
+
+        // 1. If the CPU supports AVX, let's employ AVX instructions to speedup this loop
+        if (simdAvaiable) {
+          doubleVectorSumAVX(plist, numOfRows, pAvgRes);
+        } else {
+          for (int32_t i = pInput->startRowIndex; i < pInput->numOfRows + pInput->startRowIndex; ++i) {
+            pAvgRes->sum.usum += plist[i];
+          }
+        }
+        break;
+      }
+      case TSDB_DATA_TYPE_USMALLINT: {
+        const double* plist = (const double*) pCol->pData;
+
+        // 1. If the CPU supports AVX, let's employ AVX instructions to speedup this loop
+        if (simdAvaiable) {
+          doubleVectorSumAVX(plist, numOfRows, pAvgRes);
+        } else {
+          for (int32_t i = pInput->startRowIndex; i < pInput->numOfRows + pInput->startRowIndex; ++i) {
+            pAvgRes->sum.usum += plist[i];
+          }
+        }
+        break;
+      }
+      case TSDB_DATA_TYPE_UINT: {
+        const double* plist = (const double*) pCol->pData;
+
+        // 1. If the CPU supports AVX, let's employ AVX instructions to speedup this loop
+        if (simdAvaiable) {
+          doubleVectorSumAVX(plist, numOfRows, pAvgRes);
+        } else {
+          for (int32_t i = pInput->startRowIndex; i < pInput->numOfRows + pInput->startRowIndex; ++i) {
+            pAvgRes->sum.usum += plist[i];
+          }
+        }
+        break;
+      }
+      case TSDB_DATA_TYPE_UBIGINT: {
+        const double* plist = (const double*) pCol->pData;
+
+        // 1. If the CPU supports AVX, let's employ AVX instructions to speedup this loop
+        if (simdAvaiable) {
+          doubleVectorSumAVX(plist, numOfRows, pAvgRes);
+        } else {
+          for (int32_t i = pInput->startRowIndex; i < pInput->numOfRows + pInput->startRowIndex; ++i) {
+            pAvgRes->sum.usum += plist[i];
+          }
+        }
+        break;
+      }
+      default:
+        ASSERT(0);
     }
-  } else {  // computing based on the true data block
+  } else {
     switch (type) {
       case TSDB_DATA_TYPE_TINYINT: {
         int8_t* plist = (int8_t*)pCol->pData;
diff --git a/source/libs/function/src/detail/tminmax.c b/source/libs/function/src/detail/tminmax.c
index 074e5ef428044fdd9381c77a83acb036c6923216..d239315e0ec51ad0d967a9eff19bf5160e34e70f 100644
--- a/source/libs/function/src/detail/tminmax.c
+++ b/source/libs/function/src/detail/tminmax.c
@@ -36,7 +36,7 @@ static int32_t i32VectorCmpAVX2(const int32_t* pData, int32_t numOfRows, bool is
 
   if (!isMinFunc) {  // max function
     for (int32_t i = 0; i < rounds; ++i) {
-      next = _mm256_loadu_si256((__m256i*)p);
+      next = _mm256_lddqu_si256((__m256i*)p);
       initialVal = _mm256_max_epi32(initialVal, next);
       p += bitWidth;
     }
@@ -61,7 +61,7 @@ static int32_t i32VectorCmpAVX2(const int32_t* pData, int32_t numOfRows, bool is
     }
   } else {  // min function
     for (int32_t i = 0; i < rounds; ++i) {
-      next = _mm256_loadu_si256((__m256i*)p);
+      next = _mm256_lddqu_si256((__m256i*)p);
       initialVal = _mm256_min_epi32(initialVal, next);
       p += bitWidth;
     }
@@ -369,7 +369,7 @@ int32_t doMinMaxHelper(SqlFunctionCtx* pCtx, int32_t isMinFunc) {
   }
 
   // data in current data block are qualified to the query
-  if (pInput->colDataAggIsSet) {
+  if (pInput->colDataSMAIsSet) {
     numOfElems = pInput->numOfRows - pAgg->numOfNull;
     ASSERT(pInput->numOfRows == pInput->totalRows && numOfElems >= 0);
     if (numOfElems == 0) {