diff --git a/cmake/cmake.define b/cmake/cmake.define index dbd6f30b27b140b91e71c2bb5d5dc4be45fb18d1..3b6024efc8009dda67914cda87d1cc2bd3fc09db 100644 --- a/cmake/cmake.define +++ b/cmake/cmake.define @@ -123,14 +123,20 @@ ELSE () SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -Wno-literal-suffix -Werror=return-type -fPIC -gdwarf-2 -g3 -Wformat=2 -Wno-format-nonliteral -Wno-format-truncation -Wno-format-y2k") ENDIF () - MESSAGE("System processor ID: ${CMAKE_SYSTEM_PROCESSOR}") IF (TD_INTEL_64 OR TD_INTEL_32) - ADD_DEFINITIONS("-msse4.2 -mavx -mavx2") + ADD_DEFINITIONS("-msse4.2") IF("${FMA_SUPPORT}" MATCHES "true") - MESSAGE(STATUS "turn fma function support on") + MESSAGE(STATUS "fma function supported") ADD_DEFINITIONS("-mfma") ELSE () - MESSAGE(STATUS "turn fma function support off") + MESSAGE(STATUS "fma function NOT supported") + ENDIF() + + IF("${SIMD_SUPPORT}" MATCHES "true") + ADD_DEFINITIONS("-mavx -mavx2") + MESSAGE(STATUS "cpu simd instruction AVX/AVX2 supported") + ELSE() + MESSAGE(STATUS "cpu simd instruction AVX/AVX2 NOT supported") ENDIF() ENDIF () diff --git a/cmake/cmake.platform b/cmake/cmake.platform index 3e239d2e0c9f1fb53a4c156cab52801f6206df75..c3680e0de40979e9835cd8fadb4c90282d1fc7b5 100644 --- a/cmake/cmake.platform +++ b/cmake/cmake.platform @@ -1,20 +1,17 @@ cmake_minimum_required(VERSION 3.0) -MESSAGE("Current system is ${CMAKE_SYSTEM_NAME}") - # init SET(TD_LINUX FALSE) SET(TD_WINDOWS FALSE) SET(TD_DARWIN FALSE) -MESSAGE("Compiler ID: ${CMAKE_CXX_COMPILER_ID}") if(CMAKE_COMPILER_IS_GNUCXX MATCHES 1) set(CXX_COMPILER_IS_GNU TRUE) else() set(CXX_COMPILER_IS_GNU FALSE) endif() -MESSAGE("Current system name is ${CMAKE_SYSTEM_NAME}.") +MESSAGE("Current system: ${CMAKE_SYSTEM_NAME}") IF (${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "Darwin") @@ -26,6 +23,8 @@ IF (${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "Darwin set(CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS "${CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS} 
-undefined dynamic_lookup") ENDIF () + MESSAGE("Current system processor: ${CMAKE_SYSTEM_PROCESSOR}") + IF (${CMAKE_SYSTEM_NAME} MATCHES "Linux") SET(TD_LINUX TRUE) @@ -44,7 +43,6 @@ IF (${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "Darwin SET(OSTYPE "macOS") ADD_DEFINITIONS("-DDARWIN -Wno-tautological-pointer-compare") - MESSAGE("Current system processor is ${CMAKE_SYSTEM_PROCESSOR}.") IF (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm64") MESSAGE("Current system arch is arm64") SET(TD_DARWIN_64 TRUE) @@ -80,24 +78,22 @@ ELSEIF (${CMAKE_SYSTEM_NAME} MATCHES "Windows") ENDIF() IF ("${CPUTYPE}" STREQUAL "") - MESSAGE(STATUS "The current platform " ${CMAKE_SYSTEM_PROCESSOR} " is detected") - IF (CMAKE_SYSTEM_PROCESSOR MATCHES "(amd64)|(AMD64)") - MESSAGE(STATUS "The current platform is amd64") + MESSAGE(STATUS "Current platform is amd64") SET(PLATFORM_ARCH_STR "amd64") SET(TD_INTEL_64 TRUE) ELSEIF (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86)|(X86)") - MESSAGE(STATUS "The current platform is x86") + MESSAGE(STATUS "Current platform is x86") SET(PLATFORM_ARCH_STR "i386") SET(TD_INTEL_32 TRUE) ELSEIF (CMAKE_SYSTEM_PROCESSOR MATCHES "armv7l") - MESSAGE(STATUS "The current platform is aarch32") + MESSAGE(STATUS "Current platform is aarch32") SET(PLATFORM_ARCH_STR "arm") SET(TD_ARM_32 TRUE) ADD_DEFINITIONS("-D_TD_ARM_") ADD_DEFINITIONS("-D_TD_ARM_32") ELSEIF (CMAKE_SYSTEM_PROCESSOR MATCHES "(aarch64)|(arm64)") - MESSAGE(STATUS "The current platform is aarch64") + MESSAGE(STATUS "Current platform is aarch64") SET(PLATFORM_ARCH_STR "arm64") SET(TD_ARM_64 TRUE) ADD_DEFINITIONS("-D_TD_ARM_") diff --git a/cmake/cmake.version b/cmake/cmake.version index 03598519ed9f0745f4f61dc4c6327b097f296243..0447f284f1ab35d8fbd19cc5ea7ac01bdd7699ac 100644 --- a/cmake/cmake.version +++ b/cmake/cmake.version @@ -26,7 +26,7 @@ ELSEIF (HAVE_GIT) SET(TD_VER_GIT "no git commit id") ENDIF () ELSE () - message(STATUS "no git cmd") + message(STATUS "no git found") SET(TD_VER_GIT "no git 
commit id") ENDIF () @@ -70,9 +70,9 @@ MESSAGE(STATUS "compatible: " ${TD_VER_COMPATIBLE}) MESSAGE(STATUS "commit id: " ${TD_VER_GIT}) MESSAGE(STATUS "build date: " ${TD_VER_DATE}) MESSAGE(STATUS "build type: " ${CMAKE_BUILD_TYPE}) -MESSAGE(STATUS "type: " ${TD_VER_VERTYPE}) -MESSAGE(STATUS "cpu: " ${TD_VER_CPUTYPE}) -MESSAGE(STATUS "os: " ${TD_VER_OSTYPE}) +MESSAGE(STATUS "type: " ${TD_VER_VERTYPE}) +MESSAGE(STATUS "cpu: " ${TD_VER_CPUTYPE}) +MESSAGE(STATUS "os: " ${TD_VER_OSTYPE}) MESSAGE(STATUS "============= compile version parameter information end ============= ") STRING(REPLACE "." "_" TD_LIB_VER_NUMBER ${TD_VER_NUMBER}) diff --git a/include/os/os.h b/include/os/os.h index e780611c41235b159c119780ee60cd0a0eb5593f..0334cd4d95ed866fbcd2fd7f89a61f52c29134c6 100644 --- a/include/os/os.h +++ b/include/os/os.h @@ -81,6 +81,13 @@ extern "C" { #include #include #include +#include + +#if __AVX__ +#include +#elif __SSE4_2__ +#include +#endif #include "osThread.h" diff --git a/include/os/osDef.h b/include/os/osDef.h index 297d19e21a4383e2fcc6c4f1ecf3dc86cb30f570..0bf9c6184eab2d6084ff18acda7367ac54ff8cd4 100644 --- a/include/os/osDef.h +++ b/include/os/osDef.h @@ -168,22 +168,22 @@ void syslog(int unused, const char *format, ...); } \ } while (0) -#define DEFAULT_DOUBLE_COMP(x, y) \ - do { \ - if (isnan(x) && isnan(y)) { \ - return 0; \ - } \ - if (isnan(x)) { \ - return -1; \ - } \ - if (isnan(y)) { \ - return 1; \ - } \ - if ((x) == (y)) { \ - return 0; \ - } else { \ - return (x) < (y) ? -1 : 1; \ - } \ +#define DEFAULT_DOUBLE_COMP(x, y) \ + do { \ + if (isnan(x) && isnan(y)) { \ + return 0; \ + } \ + if (isnan(x)) { \ + return -1; \ + } \ + if (isnan(y)) { \ + return 1; \ + } \ + if (fabs((x) - (y)) <= DBL_EPSILON) { \ + return 0; \ + } else { \ + return (x) < (y) ? 
-1 : 1; \ + } \ } while (0) #define DEFAULT_FLOAT_COMP(x, y) DEFAULT_DOUBLE_COMP(x, y) diff --git a/include/os/osEnv.h b/include/os/osEnv.h index c1fdc9e404c35dba510dafb76e2130ecbcc6ae05..a3bd209693a1bb3dc4958173d4f4a1744d13a713 100644 --- a/include/os/osEnv.h +++ b/include/os/osEnv.h @@ -36,6 +36,11 @@ extern int64_t tsStreamMax; extern float tsNumOfCores; extern int64_t tsTotalMemoryKB; extern char *tsProcPath; +extern char tsSIMDEnable; +extern char tsSSE42Enable; +extern char tsAVXEnable; +extern char tsAVX2Enable; +extern char tsFMAEnable; extern char configDir[]; extern char tsDataDir[]; diff --git a/include/os/osSysinfo.h b/include/os/osSysinfo.h index 47cdb02a6fb66be825484249434f2fa0c7a1a38e..7765a60f8833206a5b86cd38100e1d7fd884d8dd 100644 --- a/include/os/osSysinfo.h +++ b/include/os/osSysinfo.h @@ -40,6 +40,7 @@ int32_t taosGetOsReleaseName(char *releaseName, int32_t maxLen); int32_t taosGetCpuInfo(char *cpuModel, int32_t maxLen, float *numOfCores); int32_t taosGetCpuCores(float *numOfCores); void taosGetCpuUsage(double *cpu_system, double *cpu_engine); +int32_t taosGetCpuInstructions(char* sse42, char* avx, char* avx2, char* fma); int32_t taosGetTotalMemory(int64_t *totalKB); int32_t taosGetProcMemory(int64_t *usedKB); int32_t taosGetSysMemory(int64_t *usedKB); diff --git a/source/common/src/tglobal.c b/source/common/src/tglobal.c index 1be77077b6033005405293d41398c3c9c91b8b27..50b2c976fd6562a744b8525e24027d5a2f5cd329 100644 --- a/source/common/src/tglobal.c +++ b/source/common/src/tglobal.c @@ -15,7 +15,6 @@ #define _DEFAULT_SOURCE #include "tglobal.h" -#include "tcompare.h" #include "tconfig.h" #include "tdatablock.h" #include "tgrant.h" @@ -312,7 +311,14 @@ static int32_t taosAddSystemCfg(SConfig *pCfg) { if (cfgAddLocale(pCfg, "locale", tsLocale) != 0) return -1; if (cfgAddCharset(pCfg, "charset", tsCharset) != 0) return -1; if (cfgAddBool(pCfg, "enableCoreFile", 1, 1) != 0) return -1; - if (cfgAddFloat(pCfg, "numOfCores", tsNumOfCores, 0, 100000, 1) 
!= 0) return -1; + if (cfgAddFloat(pCfg, "numOfCores", tsNumOfCores, 1, 100000, 1) != 0) return -1; + + if (cfgAddBool(pCfg, "SSE42", tsSSE42Enable, 0) != 0) return -1; + if (cfgAddBool(pCfg, "AVX", tsAVXEnable, 0) != 0) return -1; + if (cfgAddBool(pCfg, "AVX2", tsAVX2Enable, 0) != 0) return -1; + if (cfgAddBool(pCfg, "FMA", tsFMAEnable, 0) != 0) return -1; + if (cfgAddBool(pCfg, "SIMD-Supported", tsSIMDEnable, 0) != 0) return -1; + if (cfgAddInt64(pCfg, "openMax", tsOpenMax, 0, INT64_MAX, 1) != 0) return -1; if (cfgAddInt64(pCfg, "streamMax", tsStreamMax, 0, INT64_MAX, 1) != 0) return -1; if (cfgAddInt32(pCfg, "pageSizeKB", tsPageSizeKB, 0, INT64_MAX, 1) != 0) return -1; diff --git a/source/libs/function/src/detail/tavgfunction.c b/source/libs/function/src/detail/tavgfunction.c index 50a69a4241ce28c0eaf2e07ba0aad5b32e6ae713..01e0a499eb6e7c2babada5c4845b4739828f0b74 100644 --- a/source/libs/function/src/detail/tavgfunction.c +++ b/source/libs/function/src/detail/tavgfunction.c @@ -13,7 +13,6 @@ * along with this program. If not, see . 
*/ -#include #include "builtinsimpl.h" #include "function.h" #include "tdatablock.h" @@ -49,11 +48,48 @@ typedef struct SAvgRes { int16_t type; // store the original input type, used in merge function } SAvgRes; +static void floatVectorSumAVX(const SInputColumnInfoData* pInput, const float* plist, SAvgRes* pRes) { +#if __AVX__ + // find the start position that are aligned to 32bytes address in memory + int32_t startIndex = 0; //((uint64_t)plist) & ((1<<8u)-1); + int32_t bitWidth = 8; + + int32_t remain = (pInput->numOfRows - startIndex) % bitWidth; + int32_t rounds = (pInput->numOfRows - startIndex) / bitWidth; + const float* p = &plist[startIndex]; + + __m256 val; + __m256 sum = _mm256_setzero_ps(); + + for (int32_t i = 0; i < rounds; ++i) { + val = _mm256_loadu_ps(p); + sum = _mm256_add_ps(sum, val); + p += bitWidth; + } + + // let sum up the final results + const float* q = (const float*)&sum; + pRes->sum.dsum += q[0] + q[1] + q[2] + q[3] + q[4] + q[5] + q[6] + q[7]; + + // calculate the front and the reminder items in array list + for (int32_t j = 0; j < startIndex; ++j) { + pRes->sum.dsum += plist[j]; + } + + startIndex += rounds * bitWidth; + for (int32_t j = 0; j < remain; ++j) { + pRes->sum.dsum += plist[j + startIndex]; + } +#endif +} + static int32_t handleFloatCols(const SColumnInfoData* pCol, const SInputColumnInfoData* pInput, SAvgRes* pRes) { int32_t numOfElems = 0; float* plist = (float*)pCol->pData; - if (pCol->hasNull || pInput->numOfRows < 8) { + const int32_t THRESHOLD_SIZE = 8; + + if (pCol->hasNull || pInput->numOfRows <= THRESHOLD_SIZE) { for (int32_t i = pInput->startRowIndex; i < pInput->numOfRows + pInput->startRowIndex; ++i) { if (colDataIsNull_f(pCol->nullbitmap, i)) { continue; @@ -67,46 +103,13 @@ static int32_t handleFloatCols(const SColumnInfoData* pCol, const SInputColumnIn numOfElems = pInput->numOfRows; pRes->count += pInput->numOfRows; - // 1. an software version to speedup the process by using loop unwinding. - - - - // 2. 
if both the CPU and OS support SSE4.2, let's try the faster version by using SSE4.2 SIMD - - - - // 3. If both the CPU and OS support AVX, let's employ AVX instruction to speedup this loop - // 3.1 find the start position that are aligned to 32bytes address in memory - int32_t startElem = 0;//((uint64_t)plist) & ((1<<8u)-1); - int32_t i = 0; - - int32_t bitWidth = 8; - - int32_t remain = (pInput->numOfRows - startElem) % bitWidth; - int32_t rounds = (pInput->numOfRows - startElem) / bitWidth; - const float* p = &plist[startElem]; - - __m256 loadVal; - __m256 sum = _mm256_setzero_ps(); - - for(; i < rounds; ++i) { - loadVal = _mm256_loadu_ps(p); - sum = _mm256_add_ps(sum, loadVal); - p += bitWidth; - } - - // let sum up the final results - const float* q = (const float*)&sum; - pRes->sum.dsum += q[0] + q[1] + q[2] + q[3] + q[4] + q[5] + q[6] + q[7]; - - // calculate the front and the reminder items in array list - for(int32_t j = 0; j < startElem; ++j) { - pRes->sum.dsum += plist[j]; - } - - startElem += rounds * bitWidth; - for(int32_t j = 0; j < remain; ++j) { - pRes->sum.dsum += plist[j + startElem]; + // 3. If the CPU supports AVX, let's employ AVX instructions to speedup this loop + if (tsAVXEnable && tsSIMDEnable) { + floatVectorSumAVX(pInput, plist, pRes); + } else { + for (int32_t i = pInput->startRowIndex; i < pInput->numOfRows + pInput->startRowIndex; ++i) { + pRes->sum.dsum += plist[i]; + } } } diff --git a/source/libs/function/src/detail/tminmax.c b/source/libs/function/src/detail/tminmax.c index 7814a41f4f8360cc4cdce16a2b9b30d02506a3be..074e5ef428044fdd9381c77a83acb036c6923216 100644 --- a/source/libs/function/src/detail/tminmax.c +++ b/source/libs/function/src/detail/tminmax.c @@ -13,20 +13,163 @@ * along with this program. If not, see . 
*/ -#include #include "builtinsimpl.h" #include "function.h" #include "tdatablock.h" #include "tfunctionInt.h" #include "tglobal.h" +static int32_t i32VectorCmpAVX2(const int32_t* pData, int32_t numOfRows, bool isMinFunc) { + int32_t v = 0; + +#if __AVX2__ + int32_t startElem = 0;//((uint64_t)plist) & ((1<<8u)-1); + int32_t bitWidth = 8; + + int32_t remain = (numOfRows - startElem) % bitWidth; + int32_t rounds = (numOfRows - startElem) / bitWidth; + const int32_t* p = &pData[startElem]; + + __m256i next; + __m256i initialVal = _mm256_loadu_si256((__m256i*)p); + p += bitWidth; + + if (!isMinFunc) { // max function + for (int32_t i = 0; i < rounds; ++i) { + next = _mm256_loadu_si256((__m256i*)p); + initialVal = _mm256_max_epi32(initialVal, next); + p += bitWidth; + } + + // let sum up the final results + const int32_t* q = (const int32_t*)&initialVal; + + v = TMAX(q[0], q[1]); + v = TMAX(v, q[2]); + v = TMAX(v, q[3]); + v = TMAX(v, q[4]); + v = TMAX(v, q[5]); + v = TMAX(v, q[6]); + v = TMAX(v, q[7]); + + // calculate the front and the reminder items in array list + startElem += rounds * bitWidth; + for (int32_t j = 0; j < remain; ++j) { + if (v < p[j + startElem]) { + v = p[j + startElem]; + } + } + } else { // min function + for (int32_t i = 0; i < rounds; ++i) { + next = _mm256_loadu_si256((__m256i*)p); + initialVal = _mm256_min_epi32(initialVal, next); + p += bitWidth; + } + + // let sum up the final results + const int32_t* q = (const int32_t*)&initialVal; + + v = TMIN(q[0], q[1]); + v = TMIN(v, q[2]); + v = TMIN(v, q[3]); + v = TMIN(v, q[4]); + v = TMIN(v, q[5]); + v = TMIN(v, q[6]); + v = TMIN(v, q[7]); + + // calculate the front and the remainder items in array list + startElem += rounds * bitWidth; + for (int32_t j = 0; j < remain; ++j) { + if (v > p[j + startElem]) { + v = p[j + startElem]; + } + } + } +#endif + + return v; +} + +static float floatVectorCmpAVX(const float* pData, int32_t numOfRows, bool isMinFunc) { + float v = 0; + +#if __AVX__ + int32_t 
startElem = 0;//((uint64_t)plist) & ((1<<8u)-1); + int32_t i = 0; + + int32_t bitWidth = 8; + + int32_t remain = (numOfRows - startElem) % bitWidth; + int32_t rounds = (numOfRows - startElem) / bitWidth; + const float* p = &pData[startElem]; + + __m256 next; + __m256 initialVal = _mm256_loadu_ps(p); + p += bitWidth; + + if (!isMinFunc) { // max function + for (; i < rounds; ++i) { + next = _mm256_loadu_ps(p); + initialVal = _mm256_max_ps(initialVal, next); + p += bitWidth; + } + + // let sum up the final results + const float* q = (const float*)&initialVal; + + v = TMAX(q[0], q[1]); + v = TMAX(v, q[2]); + v = TMAX(v, q[3]); + v = TMAX(v, q[4]); + v = TMAX(v, q[5]); + v = TMAX(v, q[6]); + v = TMAX(v, q[7]); + + // calculate the front and the reminder items in array list + startElem += rounds * bitWidth; + for (int32_t j = 0; j < remain; ++j) { + if (v < p[j + startElem]) { + v = p[j + startElem]; + } + } + } else { // min function + for (; i < rounds; ++i) { + next = _mm256_loadu_ps(p); + initialVal = _mm256_min_ps(initialVal, next); + p += bitWidth; + } + + // let sum up the final results + const float* q = (const float*)&initialVal; + + v = TMIN(q[0], q[1]); + v = TMIN(v, q[2]); + v = TMIN(v, q[3]); + v = TMIN(v, q[4]); + v = TMIN(v, q[5]); + v = TMIN(v, q[6]); + v = TMIN(v, q[7]); + + // calculate the front and the reminder items in array list + startElem += rounds * bitWidth; + for (int32_t j = 0; j < remain; ++j) { + if (v > p[j + startElem]) { + v = p[j + startElem]; + } + } + } +#endif + + return v; +} + static int32_t handleInt32Col(SColumnInfoData* pCol, int32_t start, int32_t numOfRows, SqlFunctionCtx* pCtx, SMinmaxResInfo* pBuf, bool isMinFunc) { int32_t* pData = (int32_t*)pCol->pData; int32_t* val = (int32_t*)&pBuf->v; int32_t numOfElems = 0; - if (pCol->hasNull || numOfRows < 8 || pCtx->subsidiaries.num > 0) { + if (pCol->hasNull || numOfRows <= 8 || pCtx->subsidiaries.num > 0) { if (isMinFunc) { // min for (int32_t i = start; i < start + numOfRows; 
++i) { if (colDataIsNull_f(pCol->nullbitmap, i)) { @@ -77,79 +220,30 @@ static int32_t handleInt32Col(SColumnInfoData* pCol, int32_t start, int32_t numO } } } else { // not has null value - // 1. software version - - - - - // 3. AVX2 version to speedup the loop - int32_t startElem = 0;//((uint64_t)plist) & ((1<<8u)-1); - int32_t i = 0; - - int32_t bitWidth = 8; - int32_t v = 0; - - int32_t remain = (numOfRows - startElem) % bitWidth; - int32_t rounds = (numOfRows - startElem) / bitWidth; - const int32_t* p = &pData[startElem]; - - __m256i next; - __m256i initialVal = _mm256_loadu_si256((__m256i*)p); - p += bitWidth; - - if (!isMinFunc) { // max function - for (; i < rounds; ++i) { - next = _mm256_loadu_si256((__m256i*)p); - initialVal = _mm256_max_epi32(initialVal, next); - p += bitWidth; + // AVX2 version to speedup the loop + if (tsAVX2Enable && tsSIMDEnable) { + *val = i32VectorCmpAVX2(pData, numOfRows, isMinFunc); + } else { + if (!pBuf->assign) { + *val = pData[0]; + pBuf->assign = true; } - // let sum up the final results - const int32_t* q = (const int32_t*)&initialVal; - - v = TMAX(q[0], q[1]); - v = TMAX(v, q[2]); - v = TMAX(v, q[3]); - v = TMAX(v, q[4]); - v = TMAX(v, q[5]); - v = TMAX(v, q[6]); - v = TMAX(v, q[7]); - - // calculate the front and the reminder items in array list - startElem += rounds * bitWidth; - for (int32_t j = 0; j < remain; ++j) { - if (v < p[j + startElem]) { - v = p[j + startElem]; + if (isMinFunc) { // min + for (int32_t i = start; i < start + numOfRows; ++i) { + if (*val > pData[i]) { + *val = pData[i]; + } } - } - } else { // min function - for (; i < rounds; ++i) { - next = _mm256_loadu_si256((__m256i*)p); - initialVal = _mm256_min_epi32(initialVal, next); - p += bitWidth; - } - - // let sum up the final results - const int32_t* q = (const int32_t*)&initialVal; - - v = TMIN(q[0], q[1]); - v = TMIN(v, q[2]); - v = TMIN(v, q[3]); - v = TMIN(v, q[4]); - v = TMIN(v, q[5]); - v = TMIN(v, q[6]); - v = TMIN(v, q[7]); - - // calculate 
the front and the reminder items in array list - startElem += rounds * bitWidth; - for (int32_t j = 0; j < remain; ++j) { - if (v > p[j + startElem]) { - v = p[j + startElem]; + } else { // max + for (int32_t i = start; i < start + numOfRows; ++i) { + if (*val < pData[i]) { + *val = pData[i]; + } } } } - *val = v; numOfElems = numOfRows; } @@ -213,79 +307,30 @@ static int32_t handleFloatCol(SColumnInfoData* pCol, int32_t start, int32_t numO } } } else { // not has null value - // 1. software version - - - - - // 3. AVX2 version to speedup the loop - int32_t startElem = 0;//((uint64_t)plist) & ((1<<8u)-1); - int32_t i = 0; - - int32_t bitWidth = 8; - float v = 0; - - int32_t remain = (numOfRows - startElem) % bitWidth; - int32_t rounds = (numOfRows - startElem) / bitWidth; - const float* p = &pData[startElem]; - - __m256 next; - __m256 initialVal = _mm256_loadu_ps(p); - p += bitWidth; - - if (!isMinFunc) { // max function - for (; i < rounds; ++i) { - next = _mm256_loadu_ps(p); - initialVal = _mm256_max_ps(initialVal, next); - p += bitWidth; + // AVX version to speedup the loop + if (tsAVXEnable && tsSIMDEnable) { + *val = (double) floatVectorCmpAVX(pData, numOfRows, isMinFunc); + } else { + if (!pBuf->assign) { + *val = pData[0]; + pBuf->assign = true; } - // let sum up the final results - const float* q = (const float*)&initialVal; - - v = TMAX(q[0], q[1]); - v = TMAX(v, q[2]); - v = TMAX(v, q[3]); - v = TMAX(v, q[4]); - v = TMAX(v, q[5]); - v = TMAX(v, q[6]); - v = TMAX(v, q[7]); - - // calculate the front and the reminder items in array list - startElem += rounds * bitWidth; - for (int32_t j = 0; j < remain; ++j) { - if (v < p[j + startElem]) { - v = p[j + startElem]; + if (isMinFunc) { // min + for (int32_t i = start; i < start + numOfRows; ++i) { + if (*val > pData[i]) { + *val = pData[i]; + } } - } - } else { // min function - for (; i < rounds; ++i) { - next = _mm256_loadu_ps(p); - initialVal = _mm256_min_ps(initialVal, next); - p += bitWidth; - } - - // let 
sum up the final results - const float* q = (const float*)&initialVal; - - v = TMIN(q[0], q[1]); - v = TMIN(v, q[2]); - v = TMIN(v, q[3]); - v = TMIN(v, q[4]); - v = TMIN(v, q[5]); - v = TMIN(v, q[6]); - v = TMIN(v, q[7]); - - // calculate the front and the reminder items in array list - startElem += rounds * bitWidth; - for (int32_t j = 0; j < remain; ++j) { - if (v > p[j + startElem]) { - v = p[j + startElem]; + } else { // max + for (int32_t i = start; i < start + numOfRows; ++i) { + if (*val < pData[i]) { + *val = pData[i]; + } } } } - *val = v; numOfElems = numOfRows; } diff --git a/source/os/src/osEnv.c b/source/os/src/osEnv.c index ac1881fc6d843b3ad215ceb809d44e5811c53328..7063d1f5745b09097f76fe3f4fd12e3f3a693bec 100644 --- a/source/os/src/osEnv.c +++ b/source/os/src/osEnv.c @@ -37,6 +37,12 @@ float tsNumOfCores = 0; int64_t tsTotalMemoryKB = 0; char *tsProcPath = NULL; +char tsSIMDEnable = 0; +char tsSSE42Enable = 0; +char tsAVXEnable = 0; +char tsAVX2Enable = 0; +char tsFMAEnable = 0; + void osDefaultInit() { taosSeedRand(taosSafeRand()); taosGetSystemLocale(tsLocale, tsCharset); @@ -99,7 +105,7 @@ bool osDataSpaceSufficient() { return tsDataSpace.size.avail > tsDataSpace.reser bool osTempSpaceSufficient() { return tsTempSpace.size.avail > tsTempSpace.reserved; } -void osSetTimezone(const char *timezone) { taosSetSystemTimezone(timezone, tsTimezoneStr, &tsDaylight, &tsTimezone); } +void osSetTimezone(const char *tz) { taosSetSystemTimezone(tz, tsTimezoneStr, &tsDaylight, &tsTimezone); } void osSetSystemLocale(const char *inLocale, const char *inCharSet) { memcpy(tsLocale, inLocale, strlen(inLocale) + 1); diff --git a/source/os/src/osFile.c b/source/os/src/osFile.c index 94a10322ed2fbc82de8ed9d328534a8dff091c0c..9b42a7ea44e44dd5eac34ab080d89a84c73961c7 100644 --- a/source/os/src/osFile.c +++ b/source/os/src/osFile.c @@ -775,6 +775,7 @@ int64_t taosGetLineFile(TdFilePtr pFile, char **__restrict ptrBuf) { return getline(ptrBuf, &len, pFile->fp); #endif } + 
int64_t taosGetsFile(TdFilePtr pFile, int32_t maxSize, char *__restrict buf) { if (pFile == NULL || buf == NULL) { return -1; @@ -785,6 +786,7 @@ int64_t taosGetsFile(TdFilePtr pFile, int32_t maxSize, char *__restrict buf) { } return strlen(buf); } + int32_t taosEOFFile(TdFilePtr pFile) { if (pFile == NULL) { return 0; diff --git a/source/os/src/osLocale.c b/source/os/src/osLocale.c index 89216ecaf49e088272c3564377e35bbac4745e7b..7319181a777cb8140396f592009507a151d2620b 100644 --- a/source/os/src/osLocale.c +++ b/source/os/src/osLocale.c @@ -67,6 +67,9 @@ char *taosCharsetReplace(char *charsetstr) { } /** + * TODO: here we may employ the systemctl API to set/get the correct locale on the Linux. In some cases, the setlocale + * seems does not response as expected. + * * In some Linux systems, setLocale(LC_CTYPE, "") may return NULL, in which case the launch of * both the TDengine Server and the Client may be interrupted. * @@ -148,7 +151,7 @@ void taosGetSystemLocale(char *outLocale, char *outCharset) { * * example: en_US.UTF-8, zh_CN.GB18030, zh_CN.UTF-8, * - * if user does not specify the locale in taos.cfg the program use default LC_CTYPE as system locale. + * If user does not specify the locale in taos.cfg, the program then uses default LC_CTYPE as system locale. * * In case of some CentOS systems, their default locale is "en_US.utf8", which is not valid code_page * for libiconv that is employed to convert string in this system. 
This program will automatically use diff --git a/source/os/src/osSysinfo.c b/source/os/src/osSysinfo.c index e5ca9faacbe31bfc7812e8880cc1c9a3cd27b298..51fff3a04f976f4e302d353e75779890cba4728b 100644 --- a/source/os/src/osSysinfo.c +++ b/source/os/src/osSysinfo.c @@ -155,8 +155,8 @@ static int32_t taosGetSysCpuInfo(SysCpuInfo *cpuInfo) { } char line[1024]; - ssize_t _bytes = taosGetsFile(pFile, sizeof(line), line); - if ((_bytes < 0) || (line == NULL)) { + ssize_t bytes = taosGetsFile(pFile, sizeof(line), line); + if (bytes < 0) { taosCloseFile(&pFile); return -1; } @@ -193,9 +193,9 @@ static int32_t taosGetProcCpuInfo(ProcCpuInfo *cpuInfo) { return -1; } - char line[1024]; - ssize_t _bytes = taosGetsFile(pFile, sizeof(line), line); - if ((_bytes < 0) || (line == NULL)) { + char line[1024] = {0}; + ssize_t bytes = taosGetsFile(pFile, sizeof(line), line); + if (bytes < 0) { taosCloseFile(&pFile); return -1; } @@ -239,6 +239,7 @@ void taosGetSystemInfo() { taosGetCpuCores(&tsNumOfCores); taosGetTotalMemory(&tsTotalMemoryKB); taosGetCpuUsage(NULL, NULL); + taosGetCpuInstructions(&tsSSE42Enable, &tsAVXEnable, &tsAVX2Enable, &tsFMAEnable); #endif } @@ -366,7 +367,7 @@ int32_t taosGetCpuInfo(char *cpuModel, int32_t maxLen, float *numOfCores) { return code; #else - char line[1024]; + char line[1024] = {0}; size_t size = 0; int32_t done = 0; int32_t code = -1; @@ -468,6 +469,46 @@ void taosGetCpuUsage(double *cpu_system, double *cpu_engine) { } } +#define __cpuid_fix(level, a, b, c, d) \ + __asm__("xor %%ecx, %%ecx\n" \ + "cpuid\n" \ + : "=a"(a), "=b"(b), "=c"(c), "=d"(d) \ + : "0"(level)) + +// todo add for windows and mac +int32_t taosGetCpuInstructions(char* sse42, char* avx, char* avx2, char* fma) { +#ifdef WINDOWS +#elif defined(_TD_DARWIN_64) +#else + + // Since the compiler is not support avx/avx2 instructions, the global variables always need to be + // set to be false +#if __AVX__ || __AVX2__ + tsSIMDEnable = true; +#else + tsSIMDEnable = false; +#endif + + 
uint32_t eax = 0, ebx = 0, ecx = 0, edx = 0; + + int32_t ret = __get_cpuid(1, &eax, &ebx, &ecx, &edx); + if (ret == 0) { + return -1; // failed to get the cpuid info + } + + *sse42 = (char) ((ecx & bit_SSE4_2) == bit_SSE4_2); + *avx = (char) ((ecx & bit_AVX) == bit_AVX); + *fma = (char) ((ecx & bit_FMA) == bit_FMA); + + // work around a bug in GCC. + // Ref to https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77756 + __cpuid_fix(7u, eax, ebx, ecx, edx); + *avx2 = (char) ((ebx & bit_AVX2) == bit_AVX2); + return 0; + +#endif +} + int32_t taosGetTotalMemory(int64_t *totalKB) { #ifdef WINDOWS MEMORYSTATUSEX memsStat; @@ -511,11 +552,11 @@ int32_t taosGetProcMemory(int64_t *usedKB) { return -1; } - ssize_t _bytes = 0; - char line[1024]; + ssize_t bytes = 0; + char line[1024] = {0}; while (!taosEOFFile(pFile)) { - _bytes = taosGetsFile(pFile, sizeof(line), line); - if ((_bytes < 0) || (line == NULL)) { + bytes = taosGetsFile(pFile, sizeof(line), line); + if (bytes < 0) { break; } if (strstr(line, "VmRSS:") != NULL) { @@ -523,7 +564,7 @@ int32_t taosGetProcMemory(int64_t *usedKB) { } } - if (line == NULL) { + if (strlen(line) < 0) { // printf("read file:%s failed", tsProcMemFile); taosCloseFile(&pFile); return -1; @@ -624,14 +665,14 @@ int32_t taosGetProcIO(int64_t *rchars, int64_t *wchars, int64_t *read_bytes, int TdFilePtr pFile = taosOpenFile(tsProcIOFile, TD_FILE_READ | TD_FILE_STREAM); if (pFile == NULL) return -1; - ssize_t _bytes = 0; - char line[1024]; + ssize_t bytes = 0; + char line[1024] = {0}; char tmp[24]; int readIndex = 0; while (!taosEOFFile(pFile)) { - _bytes = taosGetsFile(pFile, sizeof(line), line); - if (_bytes < 10 || line == NULL) { + bytes = taosGetsFile(pFile, sizeof(line), line); + if (bytes < 10) { break; } if (strstr(line, "rchar:") != NULL) { diff --git a/source/os/src/osTime.c b/source/os/src/osTime.c index 58a09565f94f5db1852bed39074624f69fcba5a7..2771c8064fa274c49202911ec2d5baf86b9452f9 100644 --- a/source/os/src/osTime.c +++ 
b/source/os/src/osTime.c @@ -339,7 +339,7 @@ char *taosStrpTime(const char *buf, const char *fmt, struct tm *tm) { #endif } -FORCE_INLINE int32_t taosGetTimeOfDay(struct timeval *tv) { +int32_t taosGetTimeOfDay(struct timeval *tv) { #ifdef WINDOWS time_t t; t = taosGetTimestampSec(); @@ -455,6 +455,7 @@ static int isLeapYear(time_t year) { else return 1; } + struct tm *taosLocalTimeNolock(struct tm *result, const time_t *timep, int dst) { if (result == NULL) { return localtime(timep); @@ -542,7 +543,9 @@ struct tm *taosLocalTimeNolock(struct tm *result, const time_t *timep, int dst) #endif return result; } + int32_t taosGetTimestampSec() { return (int32_t)time(NULL); } + int32_t taosClockGetTime(int clock_id, struct timespec *pTS) { #ifdef WINDOWS LARGE_INTEGER t; diff --git a/source/util/src/tconfig.c b/source/util/src/tconfig.c index c1fee376103a15407e84d4ef988a01676884c468..9949d9e4f1214f90a904081a7c291ef01c6c3f1d 100644 --- a/source/util/src/tconfig.c +++ b/source/util/src/tconfig.c @@ -561,13 +561,13 @@ void cfgDumpCfg(SConfig *pCfg, bool tsc, bool dump) { if (dump && strcmp(pItem->name, "scriptDir") == 0) continue; if (dump && strcmp(pItem->name, "simDebugFlag") == 0) continue; tstrncpy(src, cfgStypeStr(pItem->stype), CFG_SRC_PRINT_LEN); - for (int32_t i = 0; i < CFG_SRC_PRINT_LEN; ++i) { - if (src[i] == 0) src[i] = ' '; + for (int32_t j = 0; j < CFG_SRC_PRINT_LEN; ++j) { + if (src[j] == 0) src[j] = ' '; } tstrncpy(name, pItem->name, CFG_NAME_PRINT_LEN); - for (int32_t i = 0; i < CFG_NAME_PRINT_LEN; ++i) { - if (name[i] == 0) name[i] = ' '; + for (int32_t j = 0; j < CFG_NAME_PRINT_LEN; ++j) { + if (name[j] == 0) name[j] = ' '; } switch (pItem->dtype) { diff --git a/source/util/src/tcrc32c.c b/source/util/src/tcrc32c.c index bd662fa02cfd4d3e7c512f6e696b74b5d7e7c543..795fe9dc4fca4a53b9eed0039f07ecf8dcaf731b 100644 --- a/source/util/src/tcrc32c.c +++ b/source/util/src/tcrc32c.c @@ -24,7 +24,6 @@ #endif #include "tcrc32c.h" -#include "tdef.h" #define POLY 
0x82f63b78 #define LONG_SHIFT 8192