提交 8b4b1534 编写于 作者: P proller 提交者: alexey-milovidov

Cmake: Test on having sse (#405)

* Cmake: Test on having sse

* fix

* wip

* wip

* wip

* wip

* wip

* wip

* fix

* fix

* fix

* fix

* fix

* fix
上级 cab5d44d
......@@ -46,10 +46,6 @@ if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*)")
set (AARCH64 1)
endif ()
if (NOT AARCH64)
set (MACHINE_FLAGS "-msse4 -mpopcnt")
endif ()
set (COMMON_WARNING_FLAGS "-Wall") # -Werror is also added inside directories with our own code.
set (CXX_WARNING_FLAGS "-Wnon-virtual-dtor")
......@@ -110,14 +106,21 @@ if (PIPE)
set (COMPILER_FLAGS "${COMPILER_FLAGS} -pipe")
endif ()
include (cmake/test_cpu.cmake)
option (ARCHNATIVE "Enable -march=native compiler flag" OFF)
if (ARCHNATIVE)
set (COMPILER_FLAGS "${COMPILER_FLAGS} -march=native")
endif ()
set (CMAKE_BUILD_COLOR_MAKEFILE ON)
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COMPILER_FLAGS} -std=gnu++1y ${PLATFORM_EXTRA_CXX_FLAG} -fno-omit-frame-pointer ${COMMON_WARNING_FLAGS} ${CXX_WARNING_FLAGS} ${MACHINE_FLAGS} ${GLIBC_COMPATIBILITY_COMPILE_FLAGS}")
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COMPILER_FLAGS} -std=gnu++1y ${PLATFORM_EXTRA_CXX_FLAG} -fno-omit-frame-pointer ${COMMON_WARNING_FLAGS} ${CXX_WARNING_FLAGS} ${GLIBC_COMPATIBILITY_COMPILE_FLAGS}")
#set (CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
set (CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -O3")
set (CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -g3 -ggdb3 -fno-inline")
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${COMPILER_FLAGS} -fno-omit-frame-pointer ${COMMON_WARNING_FLAGS} ${MACHINE_FLAGS} ${GLIBC_COMPATIBILITY_COMPILE_FLAGS}")
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${COMPILER_FLAGS} -fno-omit-frame-pointer ${COMMON_WARNING_FLAGS} ${GLIBC_COMPATIBILITY_COMPILE_FLAGS}")
#set (CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE}")
set (CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} -O3")
set (CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O0 -g3 -ggdb3 -fno-inline")
......
......@@ -62,7 +62,3 @@ check_cxx_source_runs("
append_history(1,nullptr);
}
" HAVE_READLINE_HISTORY)
#if (HAVE_READLINE_HISTORY)
# add_definitions (-D HAVE_READLINE_HISTORY)
#endif ()
# Probe which x86 instruction-set flags the C++ compiler accepts and append
# every accepted flag to COMPILER_FLAGS.
#
# Each probe only compiles a tiny program with the candidate flag (nothing is
# executed), and records the outcome in HAVE_SSE41, HAVE_SSE42 or HAVE_POPCNT.
#
# Intrinsics reference:
# https://software.intel.com/sites/landingpage/IntrinsicsGuide/

include (CheckCXXSourceCompiles)

# The probes below overwrite CMAKE_REQUIRED_FLAGS. Remember the value set by
# the including scope so it can be put back at the end instead of being
# silently reset to an empty string.
set (TEST_CPU_SAVED_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS}")

# How to see which macros a flag defines (useful when guarding code with #if):
#   gcc -dM -E -mno-sse2 - < /dev/null | sort > gcc-dump-nosse2
#   gcc -dM -E -msse2    - < /dev/null | sort > gcc-dump-sse2
# The -msse2 dump additionally contains:
#   #define __SSE2__ 1
#   #define __SSE2_MATH__ 1

# SSE 4.1
#   gcc -dM -E -msse4.1 - < /dev/null | sort > gcc-dump-sse41
# adds:
#   #define __SSE4_1__ 1
set (TEST_FLAG "-msse4.1")
set (CMAKE_REQUIRED_FLAGS "${TEST_FLAG}")
check_cxx_source_compiles("
#include <smmintrin.h>
int main() {
_mm_insert_epi8(__m128i(), 0, 0);
return 0;
}
" HAVE_SSE41)
if (HAVE_SSE41)
    set (COMPILER_FLAGS "${COMPILER_FLAGS} ${TEST_FLAG}")
endif ()

# SSE 4.2
#   gcc -dM -E -msse4.2 - < /dev/null | sort > gcc-dump-sse42
# adds:
#   #define __SSE4_2__ 1
set (TEST_FLAG "-msse4.2")
set (CMAKE_REQUIRED_FLAGS "${TEST_FLAG}")
check_cxx_source_compiles("
#include <nmmintrin.h>
int main() {
_mm_crc32_u64(0, 0);
return 0;
}
" HAVE_SSE42)
if (HAVE_SSE42)
    set (COMPILER_FLAGS "${COMPILER_FLAGS} ${TEST_FLAG}")
endif ()

# POPCNT
#   gcc -dM -E -mpopcnt - < /dev/null | sort > gcc-dump-popcnt
# adds:
#   #define __POPCNT__ 1
set (TEST_FLAG "-mpopcnt")
set (CMAKE_REQUIRED_FLAGS "${TEST_FLAG}")
check_cxx_source_compiles("
int main() {
__builtin_popcountll(0);
return 0;
}
" HAVE_POPCNT)
# The flag is never added when AARCH64 is set, even if the probe succeeded.
if (HAVE_POPCNT AND NOT AARCH64)
    set (COMPILER_FLAGS "${COMPILER_FLAGS} ${TEST_FLAG}")
endif ()

# Restore the flags that were in effect before this file was included.
set (CMAKE_REQUIRED_FLAGS "${TEST_CPU_SAVED_REQUIRED_FLAGS}")

# TODO: add an SSE3 test here if you want to use it.
......@@ -12,7 +12,7 @@
#include <DB/Columns/IColumn.h>
#if defined(__x86_64__)
#if __SSE2__
#include <emmintrin.h>
#endif
......@@ -357,7 +357,7 @@ public:
const UInt8 * filt_end = filt_pos + size;
const T * data_pos = &data[0];
#if defined(__x86_64__)
#if __SSE2__
/** Чуть более оптимизированная версия.
* Исходит из допущения, что часто куски последовательно идущих значений
* полностью проходят или полностью не проходят фильтр.
......
......@@ -5,7 +5,7 @@
#include <type_traits>
#if defined(__x86_64__)
#if __SSE2__
#define LIBDIVIDE_USE_SSE2 1
#endif
......
......@@ -7,7 +7,7 @@
#include <stdint.h>
#include <string.h>
#if defined(__x86_64__)
#if __SSE4_1__
#include <smmintrin.h>
#endif
......@@ -60,7 +60,7 @@ private:
UInt8 l{};
UInt8 u{};
#if defined(__x86_64__)
#if __SSE4_1__
/// vectors filled with `l` and `u`, for determining leftmost position of the first symbol
__m128i patl, patu;
/// lower and uppercase vectors of first 16 characters of `needle`
......@@ -99,7 +99,7 @@ public:
u = u_seq[0];
}
#if defined(__x86_64__)
#if __SSE4_1__
/// for detecting leftmost position of the first symbol
patl = _mm_set1_epi8(l);
patu = _mm_set1_epi8(u);
......@@ -160,7 +160,7 @@ public:
{
static const Poco::UTF8Encoding utf8;
#if defined(__x86_64__)
#if __SSE4_1__
if (page_safe(pos))
{
const auto v_haystack = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pos));
......@@ -225,7 +225,7 @@ public:
while (haystack < haystack_end)
{
#if defined(__x86_64__)
#if __SSE4_1__
if (haystack + n <= haystack_end && page_safe(haystack))
{
const auto v_haystack = _mm_loadu_si128(reinterpret_cast<const __m128i *>(haystack));
......@@ -330,7 +330,7 @@ private:
UInt8 l{};
UInt8 u{};
#if defined(__x86_64__)
#if __SSE4_1__
/// vectors filled with `l` and `u`, for determining leftmost position of the first symbol
__m128i patl, patu;
/// lower and uppercase vectors of first 16 characters of `needle`
......@@ -348,7 +348,7 @@ public:
l = static_cast<UInt8>(std::tolower(*needle));
u = static_cast<UInt8>(std::toupper(*needle));
#if defined(__x86_64__)
#if __SSE4_1__
patl = _mm_set1_epi8(l);
patu = _mm_set1_epi8(u);
......@@ -372,7 +372,7 @@ public:
bool compare(const UInt8 * pos) const
{
#if defined(__x86_64__)
#if __SSE4_1__
if (page_safe(pos))
{
const auto v_haystack = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pos));
......@@ -424,7 +424,7 @@ public:
while (haystack < haystack_end)
{
#if defined(__x86_64__)
#if __SSE4_1__
if (haystack + n <= haystack_end && page_safe(haystack))
{
const auto v_haystack = _mm_loadu_si128(reinterpret_cast<const __m128i *>(haystack));
......@@ -516,7 +516,7 @@ private:
/// first character in `needle`
UInt8 first{};
#if defined(__x86_64__)
#if __SSE4_1__
/// vector filled `first` for determining leftmost position of the first symbol
__m128i pattern;
/// vector of first 16 characters of `needle`
......@@ -533,7 +533,7 @@ public:
first = *needle;
#if defined(__x86_64__)
#if __SSE4_1__
pattern = _mm_set1_epi8(first);
auto needle_pos = needle;
......@@ -554,7 +554,7 @@ public:
bool compare(const UInt8 * pos) const
{
#if defined(__x86_64__)
#if __SSE4_1__
if (page_safe(pos))
{
const auto v_haystack = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pos));
......@@ -604,7 +604,7 @@ public:
while (haystack < haystack_end)
{
#if defined(__x86_64__)
#if __SSE4_1__
if (haystack + n <= haystack_end && page_safe(haystack))
{
/// find first character
......
......@@ -3,9 +3,7 @@
#include <string.h>
#include <DB/Core/Defines.h>
#if defined(__x86_64__)
#if __SSE2__
#include <emmintrin.h>
......
......@@ -5,7 +5,7 @@
#include <functional>
#include <ostream>
#if defined(__x86_64__)
#if __SSE2__
#include <emmintrin.h>
#endif
......@@ -35,7 +35,7 @@ using StringRefs = std::vector<StringRef>;
using UInt64 = DB::UInt64;
#if defined(__x86_64__)
#if __SSE2__
/** Сравнение строк на равенство.
* Подход является спорным и выигрывает не во всех случаях.
......@@ -128,7 +128,7 @@ inline bool operator== (StringRef lhs, StringRef rhs)
if (lhs.size == 0)
return true;
#if defined(__x86_64__)
#if __SSE2__
return memequalSSE2Wide(lhs.data, rhs.data, lhs.size);
#else
return 0 == memcmp(lhs.data, rhs.data, lhs.size);
......@@ -169,7 +169,7 @@ struct StringRefHash64
}
};
#if defined(__x86_64__)
#if __SSE4_2__
#ifdef __SSE4_1__
#include <smmintrin.h>
......
......@@ -943,7 +943,7 @@ template <> struct FunctionUnaryArithmeticMonotonicity<NameBitNot>
/// Оптимизации для целочисленного деления на константу.
#if defined(__x86_64__)
#if __SSE2__
#define LIBDIVIDE_USE_SSE2 1
#endif
......@@ -981,7 +981,7 @@ struct DivideIntegralByConstantImpl
const A * a_end = a_pos + size;
ResultType * c_pos = &c[0];
#if defined(__x86_64__)
#if __SSE2__
static constexpr size_t values_per_sse_register = 16 / sizeof(A);
const A * a_end_sse = a_pos + size / values_per_sse_register * values_per_sse_register;
......
......@@ -5,7 +5,7 @@
#include <type_traits>
#include <array>
#if defined(__x86_64__)
#if __SSE4_1__
#include <smmintrin.h>
#endif
......@@ -268,7 +268,7 @@ namespace DB
}
};
#if defined(__x86_64__)
#if __SSE4_1__
template <typename T>
class BaseFloatRoundingComputation;
......
......@@ -16,9 +16,8 @@
#include <DB/Functions/IFunction.h>
#include <ext/range.hpp>
#if defined(__x86_64__)
#if __SSE2__
#include <emmintrin.h>
#include <nmmintrin.h>
#endif
......@@ -232,7 +231,7 @@ private:
{
const auto flip_case_mask = 'A' ^ 'a';
#if defined(__x86_64__)
#if __SSE2__
const auto bytes_sse = sizeof(__m128i);
const auto src_end_sse = src_end - (src_end - src) % bytes_sse;
......@@ -393,7 +392,7 @@ private:
static void array(const UInt8 * src, const UInt8 * src_end, UInt8 * dst)
{
#if defined(__x86_64__)
#if __SSE2__
const auto bytes_sse = sizeof(__m128i);
auto src_end_sse = src + (src_end - src) / bytes_sse * bytes_sse;
......
#if defined(__x86_64__)
#if __SSE2__
#include <emmintrin.h>
#endif
......@@ -20,7 +20,7 @@ size_t countBytesInFilter(const IColumn::Filter & filt)
const Int8 * pos = reinterpret_cast<const Int8 *>(&filt[0]);
const Int8 * end = pos + filt.size();
#if defined(__x86_64__)
#if __SSE2__ && __POPCNT__
const __m128i zero16 = _mm_setzero_si128();
const Int8 * end64 = pos + filt.size() / 64 * 64;
......@@ -95,7 +95,7 @@ void filterArraysImpl(
memcpy(&res_elems[elems_size_old], &src_elems[offset], size * sizeof(T));
};
#if defined(__x86_64__)
#if __SSE2__
const __m128i zero_vec = _mm_setzero_si128();
static constexpr size_t SIMD_BYTES = 16;
const auto filt_end_aligned = filt_pos + size / SIMD_BYTES * SIMD_BYTES;
......
......@@ -10,7 +10,7 @@
#include <DB/IO/WriteHelpers.h>
#include <DB/IO/VarInt.h>
#if defined(__x86_64__)
#if __SSE2__
#include <emmintrin.h>
#endif
......@@ -125,7 +125,7 @@ static NO_INLINE void deserializeBinarySSE2(ColumnString::Chars_t & data, Column
if (size)
{
#if defined(__x86_64__)
#if __SSE2__
/// Оптимистичная ветка, в которой возможно более эффективное копирование.
if (offset + 16 * UNROLL_TIMES <= data.allocated_size() && istr.position() + size + 16 * UNROLL_TIMES <= istr.buffer().end())
{
......
#if defined(__x86_64__)
#include <emmintrin.h>
#endif
#include <sstream>
#include <mysqlxx/Manip.h>
......
......@@ -2,8 +2,8 @@
#include <DB/IO/WriteBufferValidUTF8.h>
#include <DB/Core/Types.h>
#ifdef __x86_64__
#include <emmintrin.h>
#if __SSE2__
#include <emmintrin.h>
#endif
......@@ -69,7 +69,7 @@ void WriteBufferValidUTF8::nextImpl()
while (p < pos)
{
#ifdef __x86_64__
#if __SSE2__
/// Fast skip of ASCII
static constexpr size_t SIMD_BYTES = 16;
const char * simd_end = p + (pos - p) / SIMD_BYTES * SIMD_BYTES;
......
#if defined(__x86_64__)
#include <nmmintrin.h>
#endif
#include <string>
#include <iostream>
#include <iomanip>
......
#if defined(__x86_64__)
#include <emmintrin.h>
#endif
#include <iostream>
#include <iomanip>
......@@ -11,22 +7,6 @@
#include <DB/Common/Stopwatch.h>
#if defined(__x86_64__)
std::ostream & operator<< (std::ostream & ostr, const __m128i vec)
{
char digits[16];
_mm_store_si128(reinterpret_cast<__m128i *>(digits), vec);
ostr << "{";
for (size_t i = 0; i < 16; ++i)
ostr << (i ? ", " : "") << static_cast<int>(digits[i]);
ostr << "}";
return ostr;
}
#endif
namespace test
{
template <typename T>
......
......@@ -15,7 +15,7 @@
#include <DB/Common/HashTable/HashMap.h>
#include <DB/Interpreters/AggregationCommon.h>
#if defined(__x86_64__)
#if __SSE4_1__
#include <smmintrin.h>
#endif
......@@ -77,7 +77,7 @@ DefineStringRef(StringRef_Compare16_1_byMemcmp)
DefineStringRef(StringRef_Compare16_1_byUInt64_logicAnd)
DefineStringRef(StringRef_Compare16_1_byUInt64_bitAnd)
#if defined(__x86_64__)
#if __SSE4_1__
DefineStringRef(StringRef_Compare16_1_byIntSSE)
DefineStringRef(StringRef_Compare16_1_byFloatSSE)
DefineStringRef(StringRef_Compare16_1_bySSE4)
......@@ -196,7 +196,7 @@ inline bool compare_byUInt64_bitAnd(const char * p1, const char * p2)
& (reinterpret_cast<const UInt64 *>(p1)[1] == reinterpret_cast<const UInt64 *>(p2)[1]);
}
#if defined(__x86_64__)
#if __SSE4_1__
inline bool compare_byIntSSE(const char * p1, const char * p2)
{
......@@ -263,7 +263,7 @@ inline bool memequal(const char * p1, const char * p2, size_t size)
}
#if defined(__x86_64__)
#if __SSE4_1__
inline bool memequal_sse41(const char * p1, const char * p2, size_t size)
{
......@@ -514,7 +514,7 @@ Op(byMemcmp)
Op(byUInt64_logicAnd)
Op(byUInt64_bitAnd)
#if defined(__x86_64__)
#if __SSE4_1__
Op(byIntSSE)
Op(byFloatSSE)
......@@ -642,7 +642,7 @@ int main(int argc, char ** argv)
if (!m || m == 5) bench<StringRef_Compare16_1_byMemcmp> (data, "StringRef_Compare16_1_byMemcmp");
if (!m || m == 6) bench<StringRef_Compare16_1_byUInt64_logicAnd>(data, "StringRef_Compare16_1_byUInt64_logicAnd");
if (!m || m == 7) bench<StringRef_Compare16_1_byUInt64_bitAnd> (data, "StringRef_Compare16_1_byUInt64_bitAnd");
#if defined(__x86_64__)
#if __SSE4_1__
if (!m || m == 8) bench<StringRef_Compare16_1_byIntSSE> (data, "StringRef_Compare16_1_byIntSSE");
if (!m || m == 9) bench<StringRef_Compare16_1_byFloatSSE> (data, "StringRef_Compare16_1_byFloatSSE");
if (!m || m == 10) bench<StringRef_Compare16_1_bySSE4> (data, "StringRef_Compare16_1_bySSE4");
......
......@@ -18,7 +18,7 @@
#include <DB/Common/HashTable/HashMap.h>
#include <DB/Interpreters/AggregationCommon.h>
#if defined(__x86_64__)
#if __SSE4_1__
#include <smmintrin.h>
#endif
......@@ -139,7 +139,7 @@ struct FastHash64
};
#if defined(__x86_64__)
#if __SSE4_1__
struct CrapWow
{
......@@ -229,7 +229,7 @@ struct SimpleHash
if (size < 8)
{
#if defined(__x86_64__)
#if __SSE4_1__
return hashLessThan8(x.data, x.size);
#endif
}
......@@ -266,7 +266,7 @@ struct VerySimpleHash
if (size < 8)
{
#if defined(__x86_64__)
#if __SSE4_1__
return hashLessThan8(x.data, x.size);
#endif
}
......@@ -316,7 +316,7 @@ struct MetroHash64
};
#if defined(__x86_64__)
#if __SSE4_1__
/*struct CRC32Hash
{
......@@ -466,7 +466,7 @@ int main(int argc, char ** argv)
if (!m || m == 2) bench<StringRef_CompareMemcmp, FastHash64> (data, "StringRef_FastHash64");
if (!m || m == 3) bench<StringRef_CompareMemcmp, SimpleHash> (data, "StringRef_SimpleHash");
#if defined(__x86_64__)
#if __SSE4_1__
if (!m || m == 4) bench<StringRef_CompareMemcmp, CrapWow> (data, "StringRef_CrapWow");
if (!m || m == 5) bench<StringRef_CompareMemcmp, CRC32Hash> (data, "StringRef_CRC32Hash");
if (!m || m == 6) bench<StringRef_CompareMemcmp, CRC32ILPHash> (data, "StringRef_CRC32ILPHash");
......
......@@ -64,7 +64,7 @@ inline bool operator==(SmallStringRef lhs, SmallStringRef rhs)
if (lhs.size == 0)
return true;
#if __x86_64__
#if __SSE2__
return memequalSSE2Wide(lhs.data(), rhs.data(), lhs.size);
#else
return false;
......
#pragma once
#if defined(__x86_64__)
#if __SSE2__
#include <emmintrin.h>
#endif
#if __SSE4_2__
#include <nmmintrin.h>
#endif
......@@ -35,7 +38,7 @@ inline bool is_in(char x)
return x == s0 || is_in<s1, tail...>(x);
}
#if defined(__x86_64__)
#if __SSE2__
template <char s0>
inline __m128i mm_is_in(__m128i bytes)
{
......@@ -56,7 +59,7 @@ inline __m128i mm_is_in(__m128i bytes)
template <char... symbols>
inline const char * find_first_symbols_sse2(const char * begin, const char * end)
{
#if defined(__x86_64__)
#if __SSE2__
for (; begin + 15 < end; begin += 16)
{
__m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i *>(begin));
......@@ -83,7 +86,7 @@ template <size_t num_chars,
char c13 = 0, char c14 = 0, char c15 = 0, char c16 = 0>
inline const char * find_first_symbols_sse42_impl(const char * begin, const char * end)
{
#if defined(__x86_64__)
#if __SSE4_2__
#define MODE (_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT)
__m128i set = _mm_setr_epi8(c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, c16);
......@@ -131,8 +134,10 @@ inline const char * find_first_symbols_sse42(const char * begin, const char * en
template <char... symbols>
inline const char * find_first_symbols(const char * begin, const char * end)
{
#if __SSE4_2__
if (sizeof...(symbols) >= 5)
return detail::find_first_symbols_sse42<symbols...>(begin, end);
else
#endif
return detail::find_first_symbols_sse2<symbols...>(begin, end);
}
#include <iostream>
#include <iomanip>
#if defined(__x86_64__)
#if __SSE2__
#include <emmintrin.h>
#endif
......@@ -15,7 +15,7 @@
int main(int argc, char ** argv)
{
#if defined(__x86_64__)
#if __SSE2__
try
{
DB::ReadBufferFromFileDescriptor in(STDIN_FILENO);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册