refactor: do some internal refactor.

b83f8957 · Haojun Liao · ea83ae23 · b83f8957 · b83f8957 · b83f8957
17 changed file
--- a/cmake/cmake.define
+++ b/cmake/cmake.define
@@ -123,14 +123,20 @@ ELSE ()
        SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -Wno-literal-suffix -Werror=return-type -fPIC -gdwarf-2 -g3 -Wformat=2 -Wno-format-nonliteral -Wno-format-truncation -Wno-format-y2k")
    ENDIF ()

-    MESSAGE("System processor ID: ${CMAKE_SYSTEM_PROCESSOR}")
    IF (TD_INTEL_64 OR TD_INTEL_32)
-        ADD_DEFINITIONS("-msse4.2 -mavx -mavx2")
+        ADD_DEFINITIONS("-msse4.2")
        IF("${FMA_SUPPORT}" MATCHES "true")
-            MESSAGE(STATUS "turn fma function support on")
+            MESSAGE(STATUS "fma function supported")
            ADD_DEFINITIONS("-mfma")
        ELSE ()
-            MESSAGE(STATUS "turn fma function support off")
+            MESSAGE(STATUS "fma function NOT supported")
+        ENDIF()
+
+        IF("${SIMD_SUPPORT}" MATCHES "true")
+            ADD_DEFINITIONS("-mavx -mavx2")
+            MESSAGE(STATUS "cpu simd instruction AVX/AVX2 supported")
+        ELSE()
+            MESSAGE(STATUS "cpu simd instruction AVX/AVX2 NOT supported")
        ENDIF()
    ENDIF ()


--- a/cmake/cmake.platform
+++ b/cmake/cmake.platform
 cmake_minimum_required(VERSION 3.0)

-MESSAGE("Current system is ${CMAKE_SYSTEM_NAME}")
-
 # init
 SET(TD_LINUX FALSE)
 SET(TD_WINDOWS FALSE)
 SET(TD_DARWIN FALSE)

-MESSAGE("Compiler ID: ${CMAKE_CXX_COMPILER_ID}")
 if(CMAKE_COMPILER_IS_GNUCXX MATCHES 1)
    set(CXX_COMPILER_IS_GNU TRUE)
 else()
    set(CXX_COMPILER_IS_GNU FALSE)
 endif()

-MESSAGE("Current system name is ${CMAKE_SYSTEM_NAME}.")
+MESSAGE("Current system: ${CMAKE_SYSTEM_NAME}")

 IF (${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "Darwin")

@@ -26,6 +23,8 @@ IF (${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "Darwin
        set(CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS "${CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS} -undefined dynamic_lookup")
    ENDIF ()

+    MESSAGE("Current system processor: ${CMAKE_SYSTEM_PROCESSOR}")
+
    IF (${CMAKE_SYSTEM_NAME} MATCHES "Linux")

        SET(TD_LINUX TRUE)
@@ -44,7 +43,6 @@ IF (${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "Darwin
        SET(OSTYPE "macOS")
        ADD_DEFINITIONS("-DDARWIN -Wno-tautological-pointer-compare")

-        MESSAGE("Current system processor is ${CMAKE_SYSTEM_PROCESSOR}.")
        IF (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm64")
            MESSAGE("Current system arch is arm64")
            SET(TD_DARWIN_64 TRUE)
@@ -80,24 +78,22 @@ ELSEIF (${CMAKE_SYSTEM_NAME} MATCHES "Windows")
 ENDIF()

 IF ("${CPUTYPE}" STREQUAL "")
-  MESSAGE(STATUS "The current platform " ${CMAKE_SYSTEM_PROCESSOR} " is detected")
-
  IF (CMAKE_SYSTEM_PROCESSOR MATCHES "(amd64)|(AMD64)")
-    MESSAGE(STATUS "The current platform is amd64")
+    MESSAGE(STATUS "Current platform is amd64")
    SET(PLATFORM_ARCH_STR "amd64")
    SET(TD_INTEL_64 TRUE)
  ELSEIF (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86)|(X86)")
-    MESSAGE(STATUS "The current platform is x86")
+    MESSAGE(STATUS "Current platform is x86")
    SET(PLATFORM_ARCH_STR "i386")
    SET(TD_INTEL_32 TRUE)
  ELSEIF (CMAKE_SYSTEM_PROCESSOR MATCHES "armv7l")
-    MESSAGE(STATUS "The current platform is aarch32")
+    MESSAGE(STATUS "Current platform is aarch32")
    SET(PLATFORM_ARCH_STR "arm")
    SET(TD_ARM_32 TRUE)
    ADD_DEFINITIONS("-D_TD_ARM_")
    ADD_DEFINITIONS("-D_TD_ARM_32")
  ELSEIF (CMAKE_SYSTEM_PROCESSOR MATCHES "(aarch64)|(arm64)")
-    MESSAGE(STATUS "The current platform is aarch64")
+    MESSAGE(STATUS "Current platform is aarch64")
    SET(PLATFORM_ARCH_STR "arm64")
    SET(TD_ARM_64 TRUE)
    ADD_DEFINITIONS("-D_TD_ARM_")

--- a/cmake/cmake.version
+++ b/cmake/cmake.version
@@ -26,7 +26,7 @@ ELSEIF (HAVE_GIT)
    SET(TD_VER_GIT "no git commit id")
  ENDIF ()
 ELSE ()
-  message(STATUS "no git cmd")
+  message(STATUS "no git found")
  SET(TD_VER_GIT "no git commit id")
 ENDIF ()

@@ -70,9 +70,9 @@ MESSAGE(STATUS "compatible: " ${TD_VER_COMPATIBLE})
 MESSAGE(STATUS "commit id:  " ${TD_VER_GIT})
 MESSAGE(STATUS "build date: " ${TD_VER_DATE})
 MESSAGE(STATUS "build type: " ${CMAKE_BUILD_TYPE})
-MESSAGE(STATUS "type: " ${TD_VER_VERTYPE})
-MESSAGE(STATUS "cpu:  " ${TD_VER_CPUTYPE})
-MESSAGE(STATUS "os:   " ${TD_VER_OSTYPE})
+MESSAGE(STATUS "type:       " ${TD_VER_VERTYPE})
+MESSAGE(STATUS "cpu:        " ${TD_VER_CPUTYPE})
+MESSAGE(STATUS "os:         " ${TD_VER_OSTYPE})
 MESSAGE(STATUS "============= compile version parameter information end  ============= ")

 STRING(REPLACE "." "_" TD_LIB_VER_NUMBER ${TD_VER_NUMBER})
--- a/include/os/os.h
+++ b/include/os/os.h
@@ -81,6 +81,13 @@ extern "C" {
 #include <string.h>
 #include <wchar.h>
 #include <wctype.h>
+// <cpuid.h> is a compiler header that only exists for x86 targets, so it must not be pulled in
+// when building for other architectures (e.g. ARM).
+#if defined(__x86_64__) || defined(__i386__)
+#include <cpuid.h>
+#endif
+
+// Pick the widest SIMD intrinsic header that the compiler flags allow.
+#if __AVX__
+#include <immintrin.h>
+#elif __SSE4_2__
+#include <nmmintrin.h>
+#endif

 #include "osThread.h"


--- a/include/os/osDef.h
+++ b/include/os/osDef.h
@@ -168,22 +168,22 @@ void syslog(int unused, const char *format, ...);
    }                            \
  } while (0)

-#define DEFAULT_DOUBLE_COMP(x, y) \
-  do {                            \
-    if (isnan(x) && isnan(y)) {   \
-      return 0;                   \
-    }                             \
-    if (isnan(x)) {               \
-      return -1;                  \
-    }                             \
-    if (isnan(y)) {               \
-      return 1;                   \
-    }                             \
-    if ((x) == (y)) {             \
-      return 0;                   \
-    } else {                      \
-      return (x) < (y) ? -1 : 1;  \
-    }                             \
+// Three-way comparison of two floating point values. The macro expands to `return` statements,
+// so it can only be used inside a function that returns an integer:
+//   - two NaN values compare equal (0);
+//   - a NaN compares lower than any non-NaN value (-1 when x is NaN, 1 when y is NaN);
+//   - values whose absolute difference is at most DBL_EPSILON compare equal (0);
+//   - otherwise -1 is returned when x < y and 1 when x > y.
+// Both arguments are evaluated more than once, so they must be free of side effects.
+// NOTE(review): DBL_EPSILON is an absolute tolerance. It is also applied to float operands through
+// DEFAULT_FLOAT_COMP and to values of any magnitude, so distinct values smaller than DBL_EPSILON
+// compare equal -- confirm this is the intended ordering for sorting/grouping callers.
+#define DEFAULT_DOUBLE_COMP(x, y)         \
+  do {                                    \
+    if (isnan(x) && isnan(y)) {           \
+      return 0;                           \
+    }                                     \
+    if (isnan(x)) {                       \
+      return -1;                          \
+    }                                     \
+    if (isnan(y)) {                       \
+      return 1;                           \
+    }                                     \
+    if (fabs((x) - (y)) <= DBL_EPSILON) { \
+      return 0;                           \
+    } else {                              \
+      return (x) < (y) ? -1 : 1;          \
+    }                                     \
   } while (0)

 #define DEFAULT_FLOAT_COMP(x, y) DEFAULT_DOUBLE_COMP(x, y)

--- a/include/os/osEnv.h
+++ b/include/os/osEnv.h
@@ -36,6 +36,11 @@ extern int64_t         tsStreamMax;
 extern float           tsNumOfCores;
 extern int64_t         tsTotalMemoryKB;
 extern char           *tsProcPath;
+extern char            tsSIMDEnable;
+extern char            tsSSE42Enable;
+extern char            tsAVXEnable;
+extern char            tsAVX2Enable;
+extern char            tsFMAEnable;

 extern char configDir[];
 extern char tsDataDir[];

--- a/include/os/osSysinfo.h
+++ b/include/os/osSysinfo.h
@@ -40,6 +40,7 @@ int32_t taosGetOsReleaseName(char *releaseName, int32_t maxLen);
 int32_t taosGetCpuInfo(char *cpuModel, int32_t maxLen, float *numOfCores);
 int32_t taosGetCpuCores(float *numOfCores);
 void    taosGetCpuUsage(double *cpu_system, double *cpu_engine);
+int32_t taosGetCpuInstructions(char* sse42, char* avx, char* avx2, char* fma);
 int32_t taosGetTotalMemory(int64_t *totalKB);
 int32_t taosGetProcMemory(int64_t *usedKB);
 int32_t taosGetSysMemory(int64_t *usedKB);

--- a/source/common/src/tglobal.c
+++ b/source/common/src/tglobal.c
@@ -15,7 +15,6 @@

 #define _DEFAULT_SOURCE
 #include "tglobal.h"
-#include "tcompare.h"
 #include "tconfig.h"
 #include "tdatablock.h"
 #include "tgrant.h"
@@ -312,7 +311,14 @@ static int32_t taosAddSystemCfg(SConfig *pCfg) {
  if (cfgAddLocale(pCfg, "locale", tsLocale) != 0) return -1;
  if (cfgAddCharset(pCfg, "charset", tsCharset) != 0) return -1;
  if (cfgAddBool(pCfg, "enableCoreFile", 1, 1) != 0) return -1;
-  if (cfgAddFloat(pCfg, "numOfCores", tsNumOfCores, 0, 100000, 1) != 0) return -1;
+  if (cfgAddFloat(pCfg, "numOfCores", tsNumOfCores, 1, 100000, 1) != 0) return -1;
+
+  if (cfgAddBool(pCfg, "SSE42", tsSSE42Enable, 0) != 0) return -1;
+  if (cfgAddBool(pCfg, "AVX", tsAVXEnable, 0) != 0) return -1;
+  if (cfgAddBool(pCfg, "AVX2", tsAVX2Enable, 0) != 0) return -1;
+  if (cfgAddBool(pCfg, "FMA", tsFMAEnable, 0) != 0) return -1;
+  if (cfgAddBool(pCfg, "SIMD-Supported", tsSIMDEnable, 0) != 0) return -1;
+
  if (cfgAddInt64(pCfg, "openMax", tsOpenMax, 0, INT64_MAX, 1) != 0) return -1;
  if (cfgAddInt64(pCfg, "streamMax", tsStreamMax, 0, INT64_MAX, 1) != 0) return -1;
  if (cfgAddInt32(pCfg, "pageSizeKB", tsPageSizeKB, 0, INT64_MAX, 1) != 0) return -1;

--- a/source/libs/function/src/detail/tavgfunction.c
+++ b/source/libs/function/src/detail/tavgfunction.c
@@ -13,7 +13,6 @@
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

-#include <immintrin.h>
 #include "builtinsimpl.h"
 #include "function.h"
 #include "tdatablock.h"
@@ -49,11 +48,48 @@ typedef struct SAvgRes {
  int16_t type;  // store the original input type, used in merge function
 } SAvgRes;

+/**
+ * Add every value of a float column inside the requested row window to pRes->sum.dsum.
+ *
+ * The window is [pInput->startRowIndex, pInput->startRowIndex + pInput->numOfRows). Eight floats
+ * are added per step with 256-bit AVX instructions; when this file is compiled without AVX the
+ * rows are summed with a plain loop instead. Only sum.dsum is touched, the caller keeps
+ * maintaining the element count and must only call this for windows without NULL values.
+ *
+ * @param pInput  supplies startRowIndex and numOfRows of the window
+ * @param plist   column values, indexed from the beginning of the column
+ * @param pRes    accumulator whose sum.dsum receives the total
+ */
+static void floatVectorSumAVX(const SInputColumnInfoData* pInput, const float* plist, SAvgRes* pRes) {
+  // honor the row offset, exactly like the scalar loops of the caller do
+  const float*  p = &plist[pInput->startRowIndex];
+  const int32_t numOfRows = pInput->numOfRows;
+
+#if __AVX__
+  const int32_t bitWidth = 8;  // number of float lanes in one 256-bit register
+  const int32_t rounds = numOfRows / bitWidth;
+  const int32_t remain = numOfRows % bitWidth;
+
+  __m256 sum = _mm256_setzero_ps();
+  for (int32_t i = 0; i < rounds; ++i) {
+    // unaligned load, so no alignment requirement is placed on the column buffer
+    sum = _mm256_add_ps(sum, _mm256_loadu_ps(p));
+    p += bitWidth;
+  }
+
+  // fold the eight partial sums into the result
+  const float* q = (const float*)&sum;
+  pRes->sum.dsum += q[0] + q[1] + q[2] + q[3] + q[4] + q[5] + q[6] + q[7];
+
+  // p now points right behind the vectorized part; add the remaining rows one by one
+  for (int32_t j = 0; j < remain; ++j) {
+    pRes->sum.dsum += p[j];
+  }
+#else
+  for (int32_t j = 0; j < numOfRows; ++j) {
+    pRes->sum.dsum += p[j];
+  }
+#endif
+}
+
 static int32_t handleFloatCols(const SColumnInfoData* pCol, const SInputColumnInfoData* pInput, SAvgRes* pRes) {
  int32_t numOfElems = 0;
  float*  plist = (float*)pCol->pData;

-  if (pCol->hasNull || pInput->numOfRows < 8) {
+  const int32_t THRESHOLD_SIZE = 8;
+
+  if (pCol->hasNull || pInput->numOfRows <= THRESHOLD_SIZE) {
    for (int32_t i = pInput->startRowIndex; i < pInput->numOfRows + pInput->startRowIndex; ++i) {
      if (colDataIsNull_f(pCol->nullbitmap, i)) {
        continue;
@@ -67,46 +103,13 @@ static int32_t handleFloatCols(const SColumnInfoData* pCol, const SInputColumnIn
    numOfElems = pInput->numOfRows;
    pRes->count += pInput->numOfRows;

-    // 1. an software version to speedup the process by using loop unwinding.
-
-
-
-    // 2. if both the CPU and OS support SSE4.2, let's try the faster version by using SSE4.2 SIMD
-
-
-
-    // 3. If both the CPU and OS support AVX, let's employ AVX instruction to speedup this loop
-    // 3.1 find the start position that are aligned to 32bytes address in memory
-    int32_t startElem = 0;//((uint64_t)plist) & ((1<<8u)-1);
-    int32_t i = 0;
-
-    int32_t bitWidth = 8;
-
-    int32_t remain = (pInput->numOfRows - startElem) % bitWidth;
-    int32_t rounds = (pInput->numOfRows - startElem) / bitWidth;
-    const float* p = &plist[startElem];
-
-    __m256 loadVal;
-    __m256 sum = _mm256_setzero_ps();
-
-    for(; i < rounds; ++i) {
-      loadVal = _mm256_loadu_ps(p);
-      sum = _mm256_add_ps(sum, loadVal);
-      p += bitWidth;
-    }
-
-    // let sum up the final results
-    const float* q = (const float*)&sum;
-    pRes->sum.dsum += q[0] + q[1] + q[2] + q[3] + q[4] + q[5] + q[6] + q[7];
-
-    // calculate the front and the reminder items in array list
-    for(int32_t j = 0; j < startElem; ++j) {
-      pRes->sum.dsum += plist[j];
-    }
-
-    startElem += rounds * bitWidth;
-    for(int32_t j = 0; j < remain; ++j) {
-      pRes->sum.dsum += plist[j + startElem];
+    // 3. If the CPU supports AVX, let's employ AVX instructions to speedup this loop
+    if (tsAVXEnable && tsSIMDEnable) {
+      floatVectorSumAVX(pInput, plist, pRes);
+    } else {
+      for (int32_t i = pInput->startRowIndex; i < pInput->numOfRows + pInput->startRowIndex; ++i) {
+        pRes->sum.dsum += plist[i];
+      }
    }
  }


--- a/source/libs/function/src/detail/tminmax.c
+++ b/source/libs/function/src/detail/tminmax.c
@@ -13,20 +13,163 @@
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

-#include <immintrin.h>
 #include "builtinsimpl.h"
 #include "function.h"
 #include "tdatablock.h"
 #include "tfunctionInt.h"
 #include "tglobal.h"

+/**
+ * Return the smallest (isMinFunc == true) or largest (isMinFunc == false) value found in
+ * pData[0 .. numOfRows - 1], comparing eight int32 values per step with AVX2.
+ *
+ * Every element takes part in the comparison, so the rows must not contain NULL values.
+ * When this file is compiled without AVX2 support the function does no work and returns 0.
+ *
+ * @param pData      first value to examine
+ * @param numOfRows  number of values to examine
+ * @param isMinFunc  true to search for the minimum, false to search for the maximum
+ * @return the extreme value; 0 when numOfRows <= 0 or AVX2 is not compiled in
+ */
+static int32_t i32VectorCmpAVX2(const int32_t* pData, int32_t numOfRows, bool isMinFunc) {
+  int32_t v = 0;
+
+#if __AVX2__
+  const int32_t bitWidth = 8;  // number of int32 lanes in one 256-bit register
+
+  if (numOfRows < bitWidth) {
+    // not enough rows for one full-width load: scan them one by one
+    for (int32_t j = 0; j < numOfRows; ++j) {
+      if (j == 0 || (isMinFunc ? (pData[j] < v) : (pData[j] > v))) {
+        v = pData[j];
+      }
+    }
+    return v;
+  }
+
+  const int32_t  remain = numOfRows % bitWidth;
+  const int32_t  rounds = numOfRows / bitWidth;
+  const int32_t* p = pData;
+
+  // The first block seeds the running result, therefore the loops below start at round 1.
+  // Starting at 0 would load one block more than the array holds.
+  __m256i next;
+  __m256i initialVal = _mm256_loadu_si256((const __m256i*)p);
+  p += bitWidth;
+
+  if (!isMinFunc) {  // max function
+    for (int32_t i = 1; i < rounds; ++i) {
+      next = _mm256_loadu_si256((const __m256i*)p);
+      initialVal = _mm256_max_epi32(initialVal, next);
+      p += bitWidth;
+    }
+
+    // reduce the eight lanes to a single value
+    const int32_t* q = (const int32_t*)&initialVal;
+
+    v = TMAX(q[0], q[1]);
+    v = TMAX(v, q[2]);
+    v = TMAX(v, q[3]);
+    v = TMAX(v, q[4]);
+    v = TMAX(v, q[5]);
+    v = TMAX(v, q[6]);
+    v = TMAX(v, q[7]);
+
+    // p already points at the first row behind the vectorized part; fold in the remainder
+    for (int32_t j = 0; j < remain; ++j) {
+      if (v < p[j]) {
+        v = p[j];
+      }
+    }
+  } else {  // min function
+    for (int32_t i = 1; i < rounds; ++i) {
+      next = _mm256_loadu_si256((const __m256i*)p);
+      initialVal = _mm256_min_epi32(initialVal, next);
+      p += bitWidth;
+    }
+
+    // reduce the eight lanes to a single value
+    const int32_t* q = (const int32_t*)&initialVal;
+
+    v = TMIN(q[0], q[1]);
+    v = TMIN(v, q[2]);
+    v = TMIN(v, q[3]);
+    v = TMIN(v, q[4]);
+    v = TMIN(v, q[5]);
+    v = TMIN(v, q[6]);
+    v = TMIN(v, q[7]);
+
+    // p already points at the first row behind the vectorized part; fold in the remainder
+    for (int32_t j = 0; j < remain; ++j) {
+      if (v > p[j]) {
+        v = p[j];
+      }
+    }
+  }
+#endif
+
+  return v;
+}
+
+/**
+ * Return the smallest (isMinFunc == true) or largest (isMinFunc == false) value found in
+ * pData[0 .. numOfRows - 1], comparing eight floats per step with AVX.
+ *
+ * Every element takes part in the comparison, so the rows must not contain NULL values.
+ * When this file is compiled without AVX support the function does no work and returns 0.
+ *
+ * @param pData      first value to examine
+ * @param numOfRows  number of values to examine
+ * @param isMinFunc  true to search for the minimum, false to search for the maximum
+ * @return the extreme value; 0 when numOfRows <= 0 or AVX is not compiled in
+ */
+static float floatVectorCmpAVX(const float* pData, int32_t numOfRows, bool isMinFunc) {
+  float v = 0;
+
+#if __AVX__
+  const int32_t bitWidth = 8;  // number of float lanes in one 256-bit register
+
+  if (numOfRows < bitWidth) {
+    // not enough rows for one full-width load: scan them one by one
+    for (int32_t j = 0; j < numOfRows; ++j) {
+      if (j == 0 || (isMinFunc ? (pData[j] < v) : (pData[j] > v))) {
+        v = pData[j];
+      }
+    }
+    return v;
+  }
+
+  const int32_t remain = numOfRows % bitWidth;
+  const int32_t rounds = numOfRows / bitWidth;
+  const float*  p = pData;
+
+  // The first block seeds the running result, therefore the loops below start at round 1.
+  // Starting at 0 would load one block more than the array holds.
+  __m256 next;
+  __m256 initialVal = _mm256_loadu_ps(p);
+  p += bitWidth;
+
+  if (!isMinFunc) {  // max function
+    for (int32_t i = 1; i < rounds; ++i) {
+      next = _mm256_loadu_ps(p);
+      initialVal = _mm256_max_ps(initialVal, next);
+      p += bitWidth;
+    }
+
+    // reduce the eight lanes to a single value
+    const float* q = (const float*)&initialVal;
+
+    v = TMAX(q[0], q[1]);
+    v = TMAX(v, q[2]);
+    v = TMAX(v, q[3]);
+    v = TMAX(v, q[4]);
+    v = TMAX(v, q[5]);
+    v = TMAX(v, q[6]);
+    v = TMAX(v, q[7]);
+
+    // p already points at the first row behind the vectorized part; fold in the remainder
+    for (int32_t j = 0; j < remain; ++j) {
+      if (v < p[j]) {
+        v = p[j];
+      }
+    }
+  } else {  // min function
+    for (int32_t i = 1; i < rounds; ++i) {
+      next = _mm256_loadu_ps(p);
+      initialVal = _mm256_min_ps(initialVal, next);
+      p += bitWidth;
+    }
+
+    // reduce the eight lanes to a single value
+    const float* q = (const float*)&initialVal;
+
+    v = TMIN(q[0], q[1]);
+    v = TMIN(v, q[2]);
+    v = TMIN(v, q[3]);
+    v = TMIN(v, q[4]);
+    v = TMIN(v, q[5]);
+    v = TMIN(v, q[6]);
+    v = TMIN(v, q[7]);
+
+    // p already points at the first row behind the vectorized part; fold in the remainder
+    for (int32_t j = 0; j < remain; ++j) {
+      if (v > p[j]) {
+        v = p[j];
+      }
+    }
+  }
+#endif
+
+  return v;
+}
+
 static int32_t handleInt32Col(SColumnInfoData* pCol, int32_t start, int32_t numOfRows, SqlFunctionCtx* pCtx,
                              SMinmaxResInfo* pBuf, bool isMinFunc) {
  int32_t* pData = (int32_t*)pCol->pData;
  int32_t* val = (int32_t*)&pBuf->v;

  int32_t numOfElems = 0;
-  if (pCol->hasNull || numOfRows < 8 || pCtx->subsidiaries.num > 0) {
+  if (pCol->hasNull || numOfRows <= 8 || pCtx->subsidiaries.num > 0) {
    if (isMinFunc) {  // min
      for (int32_t i = start; i < start + numOfRows; ++i) {
        if (colDataIsNull_f(pCol->nullbitmap, i)) {
@@ -77,79 +220,30 @@ static int32_t handleInt32Col(SColumnInfoData* pCol, int32_t start, int32_t numO
      }
    }
  } else { // not has null value
-    // 1. software version
-
-
-
-
-    // 3. AVX2 version to speedup the loop
-    int32_t startElem = 0;//((uint64_t)plist) & ((1<<8u)-1);
-    int32_t i = 0;
-
-    int32_t bitWidth = 8;
-    int32_t v = 0;
-
-    int32_t remain = (numOfRows - startElem) % bitWidth;
-    int32_t rounds = (numOfRows - startElem) / bitWidth;
-    const int32_t* p = &pData[startElem];
-
-    __m256i next;
-    __m256i initialVal = _mm256_loadu_si256((__m256i*)p);
-    p += bitWidth;
-
-    if (!isMinFunc) {  // max function
-      for (; i < rounds; ++i) {
-        next = _mm256_loadu_si256((__m256i*)p);
-        initialVal = _mm256_max_epi32(initialVal, next);
-        p += bitWidth;
+    // AVX2 version to speedup the loop; the rows to scan begin at index `start`, not at 0
+    if (tsAVX2Enable && tsSIMDEnable) {
+      *val = i32VectorCmpAVX2(pData + start, numOfRows, isMinFunc);
+    } else {
+      if (!pBuf->assign) {
+        *val = pData[start];  // seed with the first row of the scanned window
+        pBuf->assign = true;
      }

-      // let sum up the final results
-      const int32_t* q = (const int32_t*)&initialVal;
-
-      v = TMAX(q[0], q[1]);
-      v = TMAX(v, q[2]);
-      v = TMAX(v, q[3]);
-      v = TMAX(v, q[4]);
-      v = TMAX(v, q[5]);
-      v = TMAX(v, q[6]);
-      v = TMAX(v, q[7]);
-
-      // calculate the front and the reminder items in array list
-      startElem += rounds * bitWidth;
-      for (int32_t j = 0; j < remain; ++j) {
-        if (v < p[j + startElem]) {
-          v = p[j + startElem];
+      if (isMinFunc) {  // min
+        for (int32_t i = start; i < start + numOfRows; ++i) {
+          if (*val > pData[i]) {
+            *val = pData[i];
+          }
        }
-      }
-    } else {  // min function
-      for (; i < rounds; ++i) {
-        next = _mm256_loadu_si256((__m256i*)p);
-        initialVal = _mm256_min_epi32(initialVal, next);
-        p += bitWidth;
-      }
-
-      // let sum up the final results
-      const int32_t* q = (const int32_t*)&initialVal;
-
-      v = TMIN(q[0], q[1]);
-      v = TMIN(v, q[2]);
-      v = TMIN(v, q[3]);
-      v = TMIN(v, q[4]);
-      v = TMIN(v, q[5]);
-      v = TMIN(v, q[6]);
-      v = TMIN(v, q[7]);
-
-      // calculate the front and the reminder items in array list
-      startElem += rounds * bitWidth;
-      for (int32_t j = 0; j < remain; ++j) {
-        if (v > p[j + startElem]) {
-          v = p[j + startElem];
+      } else {  // max
+        for (int32_t i = start; i < start + numOfRows; ++i) {
+          if (*val < pData[i]) {
+            *val = pData[i];
+          }
        }
      }
    }

-    *val = v;
    numOfElems = numOfRows;
  }

@@ -213,79 +307,30 @@ static int32_t handleFloatCol(SColumnInfoData* pCol, int32_t start, int32_t numO
      }
    }
  } else { // not has null value
-    // 1. software version
-
-
-
-
-    // 3. AVX2 version to speedup the loop
-    int32_t startElem = 0;//((uint64_t)plist) & ((1<<8u)-1);
-    int32_t i = 0;
-
-    int32_t bitWidth = 8;
-    float v = 0;
-
-    int32_t remain = (numOfRows - startElem) % bitWidth;
-    int32_t rounds = (numOfRows - startElem) / bitWidth;
-    const float* p = &pData[startElem];
-
-    __m256 next;
-    __m256 initialVal = _mm256_loadu_ps(p);
-    p += bitWidth;
-
-    if (!isMinFunc) {  // max function
-      for (; i < rounds; ++i) {
-        next = _mm256_loadu_ps(p);
-        initialVal = _mm256_max_ps(initialVal, next);
-        p += bitWidth;
+    // AVX version to speedup the loop; the rows to scan begin at index `start`, not at 0
+    if (tsAVXEnable && tsSIMDEnable) {
+      *val = (double) floatVectorCmpAVX(pData + start, numOfRows, isMinFunc);
+    } else {
+      if (!pBuf->assign) {
+        *val = pData[start];  // seed with the first row of the scanned window
+        pBuf->assign = true;
      }

-      // let sum up the final results
-      const float* q = (const float*)&initialVal;
-
-      v = TMAX(q[0], q[1]);
-      v = TMAX(v, q[2]);
-      v = TMAX(v, q[3]);
-      v = TMAX(v, q[4]);
-      v = TMAX(v, q[5]);
-      v = TMAX(v, q[6]);
-      v = TMAX(v, q[7]);
-
-      // calculate the front and the reminder items in array list
-      startElem += rounds * bitWidth;
-      for (int32_t j = 0; j < remain; ++j) {
-        if (v < p[j + startElem]) {
-          v = p[j + startElem];
+      if (isMinFunc) {  // min
+        for (int32_t i = start; i < start + numOfRows; ++i) {
+          if (*val > pData[i]) {
+            *val = pData[i];
+          }
        }
-      }
-    } else {  // min function
-      for (; i < rounds; ++i) {
-        next = _mm256_loadu_ps(p);
-        initialVal = _mm256_min_ps(initialVal, next);
-        p += bitWidth;
-      }
-
-      // let sum up the final results
-      const float* q = (const float*)&initialVal;
-
-      v = TMIN(q[0], q[1]);
-      v = TMIN(v, q[2]);
-      v = TMIN(v, q[3]);
-      v = TMIN(v, q[4]);
-      v = TMIN(v, q[5]);
-      v = TMIN(v, q[6]);
-      v = TMIN(v, q[7]);
-
-      // calculate the front and the reminder items in array list
-      startElem += rounds * bitWidth;
-      for (int32_t j = 0; j < remain; ++j) {
-        if (v > p[j + startElem]) {
-          v = p[j + startElem];
+      } else {  // max
+        for (int32_t i = start; i < start + numOfRows; ++i) {
+          if (*val < pData[i]) {
+            *val = pData[i];
+          }
        }
      }
    }

-    *val = v;
    numOfElems = numOfRows;
  }


--- a/source/os/src/osEnv.c
+++ b/source/os/src/osEnv.c
@@ -37,6 +37,12 @@ float           tsNumOfCores = 0;
 int64_t         tsTotalMemoryKB = 0;
 char           *tsProcPath = NULL;

+char            tsSIMDEnable = 0;
+char            tsSSE42Enable = 0;
+char            tsAVXEnable = 0;
+char            tsAVX2Enable = 0;
+char            tsFMAEnable = 0;
+
 void osDefaultInit() {
  taosSeedRand(taosSafeRand());
  taosGetSystemLocale(tsLocale, tsCharset);
@@ -99,7 +105,7 @@ bool osDataSpaceSufficient() { return tsDataSpace.size.avail > tsDataSpace.reser

 bool osTempSpaceSufficient() { return tsTempSpace.size.avail > tsTempSpace.reserved; }

-void osSetTimezone(const char *timezone) { taosSetSystemTimezone(timezone, tsTimezoneStr, &tsDaylight, &tsTimezone); }
+void osSetTimezone(const char *tz) { taosSetSystemTimezone(tz, tsTimezoneStr, &tsDaylight, &tsTimezone); }

 void osSetSystemLocale(const char *inLocale, const char *inCharSet) {
  memcpy(tsLocale, inLocale, strlen(inLocale) + 1);

--- a/source/os/src/osFile.c
+++ b/source/os/src/osFile.c
@@ -775,6 +775,7 @@ int64_t taosGetLineFile(TdFilePtr pFile, char **__restrict ptrBuf) {
  return getline(ptrBuf, &len, pFile->fp);
 #endif
 }
+
 int64_t taosGetsFile(TdFilePtr pFile, int32_t maxSize, char *__restrict buf) {
  if (pFile == NULL || buf == NULL) {
    return -1;
@@ -785,6 +786,7 @@ int64_t taosGetsFile(TdFilePtr pFile, int32_t maxSize, char *__restrict buf) {
  }
  return strlen(buf);
 }
+
 int32_t taosEOFFile(TdFilePtr pFile) {
  if (pFile == NULL) {
    return 0;

--- a/source/os/src/osLocale.c
+++ b/source/os/src/osLocale.c
@@ -67,6 +67,9 @@ char *taosCharsetReplace(char *charsetstr) {
 }

 /**
+ * TODO: here we may employ the systemctl API to set/get the correct locale on Linux. In some cases, setlocale
+ *  does not seem to respond as expected.
+ *
 * In some Linux systems, setLocale(LC_CTYPE, "") may return NULL, in which case the launch of
 * both the TDengine Server and the Client may be interrupted.
 *
@@ -148,7 +151,7 @@ void taosGetSystemLocale(char *outLocale, char *outCharset) {
   *
   * example: en_US.UTF-8, zh_CN.GB18030, zh_CN.UTF-8,
   *
-   * if user does not specify the locale in taos.cfg the program use default LC_CTYPE as system locale.
+   * If user does not specify the locale in taos.cfg, the program then uses default LC_CTYPE as system locale.
   *
   * In case of some CentOS systems, their default locale is "en_US.utf8", which is not valid code_page
   * for libiconv that is employed to convert string in this system. This program will automatically use

--- a/source/os/src/osSysinfo.c
+++ b/source/os/src/osSysinfo.c
@@ -155,8 +155,8 @@ static int32_t taosGetSysCpuInfo(SysCpuInfo *cpuInfo) {
  }

  char    line[1024];
-  ssize_t _bytes = taosGetsFile(pFile, sizeof(line), line);
-  if ((_bytes < 0) || (line == NULL)) {
+  ssize_t bytes = taosGetsFile(pFile, sizeof(line), line);
+  if (bytes < 0) {
    taosCloseFile(&pFile);
    return -1;
  }
@@ -193,9 +193,9 @@ static int32_t taosGetProcCpuInfo(ProcCpuInfo *cpuInfo) {
    return -1;
  }

-  char    line[1024];
-  ssize_t _bytes = taosGetsFile(pFile, sizeof(line), line);
-  if ((_bytes < 0) || (line == NULL)) {
+  char    line[1024] = {0};
+  ssize_t bytes = taosGetsFile(pFile, sizeof(line), line);
+  if (bytes < 0) {
    taosCloseFile(&pFile);
    return -1;
  }
@@ -239,6 +239,7 @@ void taosGetSystemInfo() {
  taosGetCpuCores(&tsNumOfCores);
  taosGetTotalMemory(&tsTotalMemoryKB);
  taosGetCpuUsage(NULL, NULL);
+  taosGetCpuInstructions(&tsSSE42Enable, &tsAVXEnable, &tsAVX2Enable, &tsFMAEnable);
 #endif
 }

@@ -366,7 +367,7 @@ int32_t taosGetCpuInfo(char *cpuModel, int32_t maxLen, float *numOfCores) {

  return code;
 #else
-  char    line[1024];
+  char    line[1024] = {0};
  size_t  size = 0;
  int32_t done = 0;
  int32_t code = -1;
@@ -468,6 +469,46 @@ void taosGetCpuUsage(double *cpu_system, double *cpu_engine) {
  }
 }

+// Execute CPUID for `level` with ECX (the sub-leaf selector) cleared first, and store the four
+// result registers in a, b, c and d.
+#define __cpuid_fix(level, a, b, c, d) \
+              __asm__("xor %%ecx, %%ecx\n" \
+                      "cpuid\n" \
+                      : "=a"(a), "=b"(b), "=c"(c), "=d"(d) \
+                      : "0"(level))
+
+/**
+ * Query the CPU for SSE4.2, AVX, AVX2 and FMA support and record whether this binary was
+ * compiled with AVX/AVX2 code paths (tsSIMDEnable).
+ *
+ * Detection is only implemented for x86 builds that are neither WINDOWS nor _TD_DARWIN_64; on
+ * every other platform the output flags are left untouched and 0 is returned.
+ *
+ * NOTE(review): only the CPUID feature bits are inspected. Whether the operating system has
+ * enabled the AVX register state (OSXSAVE/XGETBV) is not checked -- confirm that is acceptable.
+ *
+ * @param sse42  set to 1 when the CPU reports SSE4.2, otherwise 0
+ * @param avx    set to 1 when the CPU reports AVX, otherwise 0
+ * @param avx2   set to 1 when the CPU reports AVX2, otherwise 0
+ * @param fma    set to 1 when the CPU reports FMA, otherwise 0
+ * @return 0 on success (or when detection is not implemented), -1 when CPUID leaf 1 is unavailable
+ */
+// todo add for windows and mac
+int32_t taosGetCpuInstructions(char* sse42, char* avx, char* avx2, char* fma) {
+#ifdef WINDOWS
+  return 0;  // not implemented yet: keep the caller's flags as they are
+#elif defined(_TD_DARWIN_64)
+  return 0;  // not implemented yet: keep the caller's flags as they are
+#elif !defined(__x86_64__) && !defined(__i386__)
+  return 0;  // the CPUID instruction only exists on x86
+#else
+
+  // When the compiler was not asked to emit avx/avx2 instructions, the SIMD code paths are not
+  // part of this binary, so the global switch must stay false no matter what the CPU offers.
+#if __AVX__ || __AVX2__
+  tsSIMDEnable = true;
+#else
+  tsSIMDEnable = false;
+#endif
+
+  uint32_t eax = 0, ebx = 0, ecx = 0, edx = 0;
+
+  int32_t ret = __get_cpuid(1, &eax, &ebx, &ecx, &edx);
+  if (ret == 0) {
+    return -1;  // failed to get the cpuid info
+  }
+
+  *sse42 = (char) ((ecx & bit_SSE4_2) == bit_SSE4_2);
+  *avx   = (char) ((ecx & bit_AVX) == bit_AVX);
+  *fma   = (char) ((ecx & bit_FMA) == bit_FMA);
+
+  // AVX2 is reported in leaf 7, which must only be queried when the CPU supports that leaf.
+  *avx2 = 0;
+  if (__get_cpuid_max(0, NULL) >= 7) {
+    // work around a bug in GCC.
+    // Ref to https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77756
+    __cpuid_fix(7u, eax, ebx, ecx, edx);
+    *avx2 = (char) ((ebx & bit_AVX2) == bit_AVX2);
+  }
+  return 0;
+
+#endif
+}
+
 int32_t taosGetTotalMemory(int64_t *totalKB) {
 #ifdef WINDOWS
  MEMORYSTATUSEX memsStat;
@@ -511,11 +552,11 @@ int32_t taosGetProcMemory(int64_t *usedKB) {
    return -1;
  }

-  ssize_t _bytes = 0;
-  char    line[1024];
+  ssize_t bytes = 0;
+  char    line[1024] = {0};
  while (!taosEOFFile(pFile)) {
-    _bytes = taosGetsFile(pFile, sizeof(line), line);
-    if ((_bytes < 0) || (line == NULL)) {
+    bytes = taosGetsFile(pFile, sizeof(line), line);
+    if (bytes < 0) {
      break;
    }
    if (strstr(line, "VmRSS:") != NULL) {
@@ -523,7 +564,7 @@ int32_t taosGetProcMemory(int64_t *usedKB) {
    }
  }

-  if (line == NULL) {
+  // line starts out zero-filled, so an empty buffer means nothing could be read from the file
+  // (strlen() returns an unsigned value, a "< 0" test on it can never be true)
+  if (line[0] == '\0') {
    // printf("read file:%s failed", tsProcMemFile);
    taosCloseFile(&pFile);
    return -1;
@@ -624,14 +665,14 @@ int32_t taosGetProcIO(int64_t *rchars, int64_t *wchars, int64_t *read_bytes, int
  TdFilePtr pFile = taosOpenFile(tsProcIOFile, TD_FILE_READ | TD_FILE_STREAM);
  if (pFile == NULL) return -1;

-  ssize_t _bytes = 0;
-  char    line[1024];
+  ssize_t bytes = 0;
+  char    line[1024] = {0};
  char    tmp[24];
  int     readIndex = 0;

  while (!taosEOFFile(pFile)) {
-    _bytes = taosGetsFile(pFile, sizeof(line), line);
-    if (_bytes < 10 || line == NULL) {
+    bytes = taosGetsFile(pFile, sizeof(line), line);
+    if (bytes < 10) {
      break;
    }
    if (strstr(line, "rchar:") != NULL) {

--- a/source/os/src/osTime.c
+++ b/source/os/src/osTime.c
@@ -339,7 +339,7 @@ char *taosStrpTime(const char *buf, const char *fmt, struct tm *tm) {
 #endif
 }

-FORCE_INLINE int32_t taosGetTimeOfDay(struct timeval *tv) {
+int32_t taosGetTimeOfDay(struct timeval *tv) {
 #ifdef WINDOWS
  time_t t;
  t = taosGetTimestampSec();
@@ -455,6 +455,7 @@ static int isLeapYear(time_t year) {
  else
    return 1;
 }
+
 struct tm *taosLocalTimeNolock(struct tm *result, const time_t *timep, int dst) {
  if (result == NULL) {
    return localtime(timep);
@@ -542,7 +543,9 @@ struct tm *taosLocalTimeNolock(struct tm *result, const time_t *timep, int dst)
 #endif
  return result;
 }
+
 int32_t taosGetTimestampSec() { return (int32_t)time(NULL); }
+
 int32_t taosClockGetTime(int clock_id, struct timespec *pTS) {
 #ifdef WINDOWS
  LARGE_INTEGER        t;

--- a/source/util/src/tconfig.c
+++ b/source/util/src/tconfig.c
@@ -561,13 +561,13 @@ void cfgDumpCfg(SConfig *pCfg, bool tsc, bool dump) {
    if (dump && strcmp(pItem->name, "scriptDir") == 0) continue;
    if (dump && strcmp(pItem->name, "simDebugFlag") == 0) continue;
    tstrncpy(src, cfgStypeStr(pItem->stype), CFG_SRC_PRINT_LEN);
-    for (int32_t i = 0; i < CFG_SRC_PRINT_LEN; ++i) {
-      if (src[i] == 0) src[i] = ' ';
+    for (int32_t j = 0; j < CFG_SRC_PRINT_LEN; ++j) {
+      if (src[j] == 0) src[j] = ' ';
    }

    tstrncpy(name, pItem->name, CFG_NAME_PRINT_LEN);
-    for (int32_t i = 0; i < CFG_NAME_PRINT_LEN; ++i) {
-      if (name[i] == 0) name[i] = ' ';
+    for (int32_t j = 0; j < CFG_NAME_PRINT_LEN; ++j) {
+      if (name[j] == 0) name[j] = ' ';
    }

    switch (pItem->dtype) {

--- a/source/util/src/tcrc32c.c
+++ b/source/util/src/tcrc32c.c
@@ -24,7 +24,6 @@
 #endif

 #include "tcrc32c.h"
-#include "tdef.h"

 #define POLY        0x82f63b78
 #define LONG_SHIFT  8192