diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5080447121f3d4d7af8147b26567f3d4ed31a74d..e41068f579484e6cd90a4da847a1f201ae08a9c2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -132,6 +132,10 @@ elseif(LOWERCASE_CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64(le)?")
 else()
 	set(NEEDS_SIMDE "1")
 	add_definitions(-DNEEDS_SIMDE=1)
+	if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX)
+		set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp-simd -DSIMDE_ENABLE_OPENMP")
+		set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fopenmp-simd -DSIMDE_ENABLE_OPENMP")
+	endif()
 	set(ARCH_SIMD_FLAGS "")
 	message(STATUS "No Native SSE2 SIMD Support - Using SIMDE")
 endif()
diff --git a/libobs/util/simde/README.libobs b/libobs/util/simde/README.libobs
new file mode 100644
index 0000000000000000000000000000000000000000..20ea622def469141367ba42663ce3f85b18216be
--- /dev/null
+++ b/libobs/util/simde/README.libobs
@@ -0,0 +1,5 @@
+This is a slightly modified version of https://github.com/nemequ/simde/commit/cafec4b952fa5a31a51a10326f97c2e7c9067771
+sse{,2}.h and mmx.h were moved down from the original "x86" subdirectory,
+subsequently the '#include "../simde-common.h"' line in mmx.h was changed to '#include "simde-common.h"'
+
+Then the code was reformatted using the "formatcode.sh" script in the root of this repository.
diff --git a/libobs/util/simde/check.h b/libobs/util/simde/check.h
index 2ad107ebf9585dfdb1fafa456cd474d8aa5fd3dc..b5015fd90131e98d89075d0fd7728cbea68b88e2 100644
--- a/libobs/util/simde/check.h
+++ b/libobs/util/simde/check.h
@@ -6,6 +6,8 @@
  * copyright and related or neighboring rights to this code. For
  * details, see the Creative Commons Zero 1.0 Universal license at
  * https://creativecommons.org/publicdomain/zero/1.0/
+ *
+ * SPDX-License-Identifier: CC0-1.0
  */

 #if !defined(SIMDE_CHECK_H)
@@ -15,6 +17,7 @@
 #define SIMDE_NDEBUG 1
 #endif

+#include "hedley.h"
 #include <stdint.h>

 #if !defined(_WIN32)
@@ -32,24 +35,47 @@
 #endif

 #if defined(_MSC_VER) && (_MSC_VER >= 1500)
-#define SIMDE__PUSH_DISABLE_MSVC_C4127 \
+#define SIMDE_PUSH_DISABLE_MSVC_C4127_ \
 	__pragma(warning(push)) __pragma(warning(disable : 4127))
-#define SIMDE__POP_DISABLE_MSVC_C4127 __pragma(warning(pop))
+#define SIMDE_POP_DISABLE_MSVC_C4127_ __pragma(warning(pop))
 #else
-#define SIMDE__PUSH_DISABLE_MSVC_C4127
-#define SIMDE__POP_DISABLE_MSVC_C4127
+#define SIMDE_PUSH_DISABLE_MSVC_C4127_
+#define SIMDE_POP_DISABLE_MSVC_C4127_
 #endif

 #if !defined(simde_errorf)
+#if defined(__has_include)
+#if __has_include(<stdio.h>)
+#include <stdio.h>
+#endif
+#elif defined(SIMDE_STDC_HOSTED)
+#if SIMDE_STDC_HOSTED == 1
 #include <stdio.h>
-#include <stdlib.h>
+#endif
+#elif defined(__STDC_HOSTED__)
+#if __STDC_HOSTETD__ == 1
+#include <stdio.h>
+#endif
+#endif
+
+#include "debug-trap.h"
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DIAGNOSTIC_DISABLE_VARIADIC_MACROS_
+#if defined(EOF)
 #define simde_errorf(format, ...) \
 	(fprintf(stderr, format, __VA_ARGS__), abort())
+#else
+#define simde_errorf(format, ...) (simde_trap())
+#endif
+HEDLEY_DIAGNOSTIC_POP
 #endif

 #define simde_error(msg) simde_errorf("%s", msg)

-#if defined(SIMDE_NDEBUG)
+#if defined(SIMDE_NDEBUG) || \
+	(defined(__cplusplus) && (__cplusplus < 201103L)) || \
+	(defined(__STDC__) && (__STDC__ < 199901L))
 #if defined(SIMDE_CHECK_FAIL_DEFINED)
 #define simde_assert(expr)
 #else
@@ -78,8 +104,8 @@
 		if (!HEDLEY_LIKELY(expr)) { \
 			simde_error("assertion failed: " #expr "\n"); \
 		} \
-		SIMDE__PUSH_DISABLE_MSVC_C4127 \
-	} while (0) SIMDE__POP_DISABLE_MSVC_C4127
+		SIMDE_PUSH_DISABLE_MSVC_C4127_ \
+	} while (0) SIMDE_POP_DISABLE_MSVC_C4127_

 #define simde_assert_true(expr) \
 	do { \
@@ -87,8 +113,8 @@
 			simde_error("assertion failed: " #expr \
 				    " is not true\n"); \
 		} \
-		SIMDE__PUSH_DISABLE_MSVC_C4127 \
-	} while (0) SIMDE__POP_DISABLE_MSVC_C4127
+		SIMDE_PUSH_DISABLE_MSVC_C4127_ \
+	} while (0) SIMDE_POP_DISABLE_MSVC_C4127_

 #define simde_assert_false(expr) \
 	do { \
@@ -96,8 +122,8 @@
 			simde_error("assertion failed: " #expr \
 				    " is not false\n"); \
 		} \
-		SIMDE__PUSH_DISABLE_MSVC_C4127 \
-	} while (0) SIMDE__POP_DISABLE_MSVC_C4127
+		SIMDE_PUSH_DISABLE_MSVC_C4127_ \
+	} while (0) SIMDE_POP_DISABLE_MSVC_C4127_

 #define simde_assert_type_full(prefix, suffix, T, fmt, a, op, b) \
 	do { \
@@ -110,8 +136,8 @@
 				    #a, #op, #b, simde_tmp_a_, #op, \
 				    simde_tmp_b_); \
 		} \
-		SIMDE__PUSH_DISABLE_MSVC_C4127 \
-	} while (0) SIMDE__POP_DISABLE_MSVC_C4127
+		SIMDE_PUSH_DISABLE_MSVC_C4127_ \
+	} while (0) SIMDE_POP_DISABLE_MSVC_C4127_

 #define simde_assert_double_equal(a, b, precision) \
 	do { \
@@ -127,8 +153,8 @@
 				    "g == %0." #precision "g)\n", \
 				    #a, #b, simde_tmp_a_, simde_tmp_b_); \
 		} \
-		SIMDE__PUSH_DISABLE_MSVC_C4127 \
-	} while (0) SIMDE__POP_DISABLE_MSVC_C4127
+		SIMDE_PUSH_DISABLE_MSVC_C4127_ \
+	} while (0) SIMDE_POP_DISABLE_MSVC_C4127_

 #include <string.h>
 #define simde_assert_string_equal(a, b) \
@@ -141,8 +167,8 @@
 				"assertion failed: string %s == %s (\"%s\" == \"%s\")\n", \
 				#a, #b, simde_tmp_a_, simde_tmp_b_); \
 		} \
-		SIMDE__PUSH_DISABLE_MSVC_C4127 \
-	} while (0) SIMDE__POP_DISABLE_MSVC_C4127
+		SIMDE_PUSH_DISABLE_MSVC_C4127_ \
+	} while (0) SIMDE_POP_DISABLE_MSVC_C4127_

 #define simde_assert_string_not_equal(a, b) \
 	do { \
@@ -154,8 +180,8 @@
 				"assertion failed: string %s != %s (\"%s\" == \"%s\")\n", \
 				#a, #b, simde_tmp_a_, simde_tmp_b_); \
 		} \
-		SIMDE__PUSH_DISABLE_MSVC_C4127 \
-	} while (0) SIMDE__POP_DISABLE_MSVC_C4127
+		SIMDE_PUSH_DISABLE_MSVC_C4127_ \
+	} while (0) SIMDE_POP_DISABLE_MSVC_C4127_

 #define simde_assert_memory_equal(size, a, b) \
 	do { \
@@ -180,8 +206,8 @@
 				} \
 			} \
 		} \
-		SIMDE__PUSH_DISABLE_MSVC_C4127 \
-	} while (0) SIMDE__POP_DISABLE_MSVC_C4127
+		SIMDE_PUSH_DISABLE_MSVC_C4127_ \
+	} while (0) SIMDE_POP_DISABLE_MSVC_C4127_

 #define simde_assert_memory_not_equal(size, a, b) \
 	do { \
@@ -197,8 +223,8 @@
 				     "u bytes)\n", \
 				     #a, #b, simde_tmp_size_); \
 		} \
-		SIMDE__PUSH_DISABLE_MSVC_C4127 \
-	} while (0) SIMDE__POP_DISABLE_MSVC_C4127
+		SIMDE_PUSH_DISABLE_MSVC_C4127_ \
+	} while (0) SIMDE_POP_DISABLE_MSVC_C4127_
 #endif

 #define simde_assert_type(T, fmt, a, op, b) \
diff --git a/libobs/util/simde/debug-trap.h b/libobs/util/simde/debug-trap.h
new file mode 100644
index 0000000000000000000000000000000000000000..be901fafa18c2098ae71c5b906eba4c5ab7f1dc0
--- /dev/null
+++ b/libobs/util/simde/debug-trap.h
@@ -0,0 +1,117 @@
+/* Debugging assertions and traps
+ * Portable Snippets - https://github.com/nemequ/portable-snippets
+ * Created by Evan Nemerson <evan@nemerson.com>
+ *
+ * To the extent possible under law, the authors have waived all
+ * copyright and related or neighboring rights to this code. For
+ * details, see the Creative Commons Zero 1.0 Universal license at
+ * https://creativecommons.org/publicdomain/zero/1.0/
+ *
+ * SPDX-License-Identifier: CC0-1.0
+ */
+
+#if !defined(SIMDE_DEBUG_TRAP_H)
+#define SIMDE_DEBUG_TRAP_H
+
+#if !defined(SIMDE_NDEBUG) && defined(NDEBUG) && !defined(SIMDE_DEBUG)
+#define SIMDE_NDEBUG 1
+#endif
+
+#if defined(__has_builtin) && !defined(__ibmxl__)
+#if __has_builtin(__builtin_debugtrap)
+#define simde_trap() __builtin_debugtrap()
+#elif __has_builtin(__debugbreak)
+#define simde_trap() __debugbreak()
+#endif
+#endif
+#if !defined(simde_trap)
+#if defined(_MSC_VER) || defined(__INTEL_COMPILER)
+#define simde_trap() __debugbreak()
+#elif defined(__ARMCC_VERSION)
+#define simde_trap() __breakpoint(42)
+#elif defined(__ibmxl__) || defined(__xlC__)
+#include <builtins.h>
+#define simde_trap() __trap(42)
+#elif defined(__DMC__) && defined(_M_IX86)
+static inline void simde_trap(void)
+{
+	__asm int 3h;
+}
+#elif defined(__i386__) || defined(__x86_64__)
+static inline void simde_trap(void)
+{
+	__asm__ __volatile__("int $03");
+}
+#elif defined(__thumb__)
+static inline void simde_trap(void)
+{
+	__asm__ __volatile__(".inst 0xde01");
+}
+#elif defined(__aarch64__)
+static inline void simde_trap(void)
+{
+	__asm__ __volatile__(".inst 0xd4200000");
+}
+#elif defined(__arm__)
+static inline void simde_trap(void)
+{
+	__asm__ __volatile__(".inst 0xe7f001f0");
+}
+#elif defined(__alpha__) && !defined(__osf__)
+static inline void simde_trap(void)
+{
+	__asm__ __volatile__("bpt");
+}
+#elif defined(_54_)
+static inline void simde_trap(void)
+{
+	__asm__ __volatile__("ESTOP");
+}
+#elif defined(_55_)
+static inline void simde_trap(void)
+{
+	__asm__ __volatile__(
+		";\n .if (.MNEMONIC)\n ESTOP_1\n .else\n ESTOP_1()\n .endif\n NOP");
+}
+#elif defined(_64P_)
+static inline void simde_trap(void)
+{
+	__asm__ __volatile__("SWBP 0");
+}
+#elif defined(_6x_)
+static inline void simde_trap(void)
+{
+	__asm__ __volatile__("NOP\n .word 0x10000000");
+}
+#elif defined(__STDC_HOSTED__) && (__STDC_HOSTED__ == 0) && defined(__GNUC__)
+#define simde_trap() __builtin_trap()
+#else
+#include <signal.h>
+#if defined(SIGTRAP)
+#define simde_trap() raise(SIGTRAP)
+#else
+#define simde_trap() raise(SIGABRT)
+#endif
+#endif
+#endif
+
+#if defined(HEDLEY_LIKELY)
+#define SIMDE_DBG_LIKELY(expr) HEDLEY_LIKELY(expr)
+#elif defined(__GNUC__) && (__GNUC__ >= 3)
+#define SIMDE_DBG_LIKELY(expr) __builtin_expect(!!(expr), 1)
+#else
+#define SIMDE_DBG_LIKELY(expr) (!!(expr))
+#endif
+
+#if !defined(SIMDE_NDEBUG) || (SIMDE_NDEBUG == 0)
+#define simde_dbg_assert(expr) \
+	do { \
+		if (!SIMDE_DBG_LIKELY(expr)) { \
+			simde_trap(); \
+		} \
+	} while (0)
+#else
+#define simde_dbg_assert(expr)
+#endif
+
+#endif /* !defined(SIMDE_DEBUG_TRAP_H) */
diff --git a/libobs/util/simde/hedley.h b/libobs/util/simde/hedley.h
index 3c9cc49de651e31e7ed98840cc078600f3cfa297..3aae5dd8ea9c58b1f497520b717d5161bc5c6166 100644
--- a/libobs/util/simde/hedley.h
+++ b/libobs/util/simde/hedley.h
@@ -10,11 +10,11 @@
  * SPDX-License-Identifier: CC0-1.0
  */

-#if !defined(HEDLEY_VERSION) || (HEDLEY_VERSION < 10)
+#if !defined(HEDLEY_VERSION) || (HEDLEY_VERSION < 12)
 #if defined(HEDLEY_VERSION)
 #undef HEDLEY_VERSION
 #endif
-#define HEDLEY_VERSION 10
+#define HEDLEY_VERSION 12

 #if defined(HEDLEY_STRINGIFY_EX)
 #undef HEDLEY_STRINGIFY_EX
@@ -263,12 +263,16 @@
 #if defined(HEDLEY_TI_VERSION)
 #undef HEDLEY_TI_VERSION
 #endif
-#if defined(__TI_COMPILER_VERSION__)
+#if defined(__TI_COMPILER_VERSION__) && \
+	(defined(__TMS470__) || defined(__TI_ARM__) ||
defined(__MSP430__) || \ + defined(__TMS320C2000__)) +#if (__TI_COMPILER_VERSION__ >= 16000000) #define HEDLEY_TI_VERSION \ HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, \ (__TI_COMPILER_VERSION__ % 1000000) / 1000, \ (__TI_COMPILER_VERSION__ % 1000)) #endif +#endif #if defined(HEDLEY_TI_VERSION_CHECK) #undef HEDLEY_TI_VERSION_CHECK @@ -280,6 +284,127 @@ #define HEDLEY_TI_VERSION_CHECK(major, minor, patch) (0) #endif +#if defined(HEDLEY_TI_CL2000_VERSION) +#undef HEDLEY_TI_CL2000_VERSION +#endif +#if defined(__TI_COMPILER_VERSION__) && defined(__TMS320C2000__) +#define HEDLEY_TI_CL2000_VERSION \ + HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, \ + (__TI_COMPILER_VERSION__ % 1000000) / 1000, \ + (__TI_COMPILER_VERSION__ % 1000)) +#endif + +#if defined(HEDLEY_TI_CL2000_VERSION_CHECK) +#undef HEDLEY_TI_CL2000_VERSION_CHECK +#endif +#if defined(HEDLEY_TI_CL2000_VERSION) +#define HEDLEY_TI_CL2000_VERSION_CHECK(major, minor, patch) \ + (HEDLEY_TI_CL2000_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else +#define HEDLEY_TI_CL2000_VERSION_CHECK(major, minor, patch) (0) +#endif + +#if defined(HEDLEY_TI_CL430_VERSION) +#undef HEDLEY_TI_CL430_VERSION +#endif +#if defined(__TI_COMPILER_VERSION__) && defined(__MSP430__) +#define HEDLEY_TI_CL430_VERSION \ + HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, \ + (__TI_COMPILER_VERSION__ % 1000000) / 1000, \ + (__TI_COMPILER_VERSION__ % 1000)) +#endif + +#if defined(HEDLEY_TI_CL430_VERSION_CHECK) +#undef HEDLEY_TI_CL430_VERSION_CHECK +#endif +#if defined(HEDLEY_TI_CL430_VERSION) +#define HEDLEY_TI_CL430_VERSION_CHECK(major, minor, patch) \ + (HEDLEY_TI_CL430_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else +#define HEDLEY_TI_CL430_VERSION_CHECK(major, minor, patch) (0) +#endif + +#if defined(HEDLEY_TI_ARMCL_VERSION) +#undef HEDLEY_TI_ARMCL_VERSION +#endif +#if defined(__TI_COMPILER_VERSION__) && \ + (defined(__TMS470__) || defined(__TI_ARM__)) +#define HEDLEY_TI_ARMCL_VERSION \ + HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, \ + (__TI_COMPILER_VERSION__ % 1000000) / 1000, \ + (__TI_COMPILER_VERSION__ % 1000)) +#endif + +#if defined(HEDLEY_TI_ARMCL_VERSION_CHECK) +#undef HEDLEY_TI_ARMCL_VERSION_CHECK +#endif +#if defined(HEDLEY_TI_ARMCL_VERSION) +#define HEDLEY_TI_ARMCL_VERSION_CHECK(major, minor, patch) \ + (HEDLEY_TI_ARMCL_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else +#define HEDLEY_TI_ARMCL_VERSION_CHECK(major, minor, patch) (0) +#endif + +#if defined(HEDLEY_TI_CL6X_VERSION) +#undef HEDLEY_TI_CL6X_VERSION +#endif +#if defined(__TI_COMPILER_VERSION__) && defined(__TMS320C6X__) +#define HEDLEY_TI_CL6X_VERSION \ + HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, \ + (__TI_COMPILER_VERSION__ % 1000000) / 1000, \ + (__TI_COMPILER_VERSION__ % 1000)) +#endif + +#if defined(HEDLEY_TI_CL6X_VERSION_CHECK) +#undef HEDLEY_TI_CL6X_VERSION_CHECK +#endif +#if defined(HEDLEY_TI_CL6X_VERSION) +#define HEDLEY_TI_CL6X_VERSION_CHECK(major, minor, patch) \ + (HEDLEY_TI_CL6X_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else +#define HEDLEY_TI_CL6X_VERSION_CHECK(major, minor, patch) (0) +#endif + +#if defined(HEDLEY_TI_CL7X_VERSION) +#undef HEDLEY_TI_CL7X_VERSION +#endif +#if defined(__TI_COMPILER_VERSION__) && defined(__C7000__) +#define HEDLEY_TI_CL7X_VERSION \ + HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, \ + (__TI_COMPILER_VERSION__ % 1000000) / 1000, \ + (__TI_COMPILER_VERSION__ % 1000)) +#endif + +#if defined(HEDLEY_TI_CL7X_VERSION_CHECK) +#undef 
HEDLEY_TI_CL7X_VERSION_CHECK +#endif +#if defined(HEDLEY_TI_CL7X_VERSION) +#define HEDLEY_TI_CL7X_VERSION_CHECK(major, minor, patch) \ + (HEDLEY_TI_CL7X_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else +#define HEDLEY_TI_CL7X_VERSION_CHECK(major, minor, patch) (0) +#endif + +#if defined(HEDLEY_TI_CLPRU_VERSION) +#undef HEDLEY_TI_CLPRU_VERSION +#endif +#if defined(__TI_COMPILER_VERSION__) && defined(__PRU__) +#define HEDLEY_TI_CLPRU_VERSION \ + HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, \ + (__TI_COMPILER_VERSION__ % 1000000) / 1000, \ + (__TI_COMPILER_VERSION__ % 1000)) +#endif + +#if defined(HEDLEY_TI_CLPRU_VERSION_CHECK) +#undef HEDLEY_TI_CLPRU_VERSION_CHECK +#endif +#if defined(HEDLEY_TI_CLPRU_VERSION) +#define HEDLEY_TI_CLPRU_VERSION_CHECK(major, minor, patch) \ + (HEDLEY_TI_CLPRU_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else +#define HEDLEY_TI_CLPRU_VERSION_CHECK(major, minor, patch) (0) +#endif + #if defined(HEDLEY_CRAY_VERSION) #undef HEDLEY_CRAY_VERSION #endif @@ -408,7 +533,12 @@ #if defined(HEDLEY_GNUC_VERSION) && !defined(__clang__) && \ !defined(HEDLEY_INTEL_VERSION) && !defined(HEDLEY_PGI_VERSION) && \ !defined(HEDLEY_ARM_VERSION) && !defined(HEDLEY_TI_VERSION) && \ - !defined(__COMPCERT__) + !defined(HEDLEY_TI_ARMCL_VERSION) && \ + !defined(HEDLEY_TI_CL430_VERSION) && \ + !defined(HEDLEY_TI_CL2000_VERSION) && \ + !defined(HEDLEY_TI_CL6X_VERSION) && \ + !defined(HEDLEY_TI_CL7X_VERSION) && \ + !defined(HEDLEY_TI_CLPRU_VERSION) && !defined(__COMPCERT__) #define HEDLEY_GCC_VERSION HEDLEY_GNUC_VERSION #endif @@ -456,12 +586,30 @@ #if defined(HEDLEY_HAS_CPP_ATTRIBUTE) #undef HEDLEY_HAS_CPP_ATTRIBUTE #endif -#if defined(__has_cpp_attribute) && defined(__cplusplus) +#if defined(__has_cpp_attribute) && defined(__cplusplus) && \ + (!defined(HEDLEY_SUNPRO_VERSION) || \ + HEDLEY_SUNPRO_VERSION_CHECK(5, 15, 0)) #define HEDLEY_HAS_CPP_ATTRIBUTE(attribute) __has_cpp_attribute(attribute) #else #define HEDLEY_HAS_CPP_ATTRIBUTE(attribute) (0) #endif +#if defined(HEDLEY_HAS_CPP_ATTRIBUTE_NS) +#undef HEDLEY_HAS_CPP_ATTRIBUTE_NS +#endif +#if !defined(__cplusplus) || !defined(__has_cpp_attribute) +#define HEDLEY_HAS_CPP_ATTRIBUTE_NS(ns, attribute) (0) +#elif !defined(HEDLEY_PGI_VERSION) && !defined(HEDLEY_IAR_VERSION) && \ + (!defined(HEDLEY_SUNPRO_VERSION) || \ + HEDLEY_SUNPRO_VERSION_CHECK(5, 15, 0)) && \ + (!defined(HEDLEY_MSVC_VERSION) || \ + HEDLEY_MSVC_VERSION_CHECK(19, 20, 0)) +#define HEDLEY_HAS_CPP_ATTRIBUTE_NS(ns, attribute) \ + HEDLEY_HAS_CPP_ATTRIBUTE(ns::attribute) +#else +#define HEDLEY_HAS_CPP_ATTRIBUTE_NS(ns, attribute) (0) +#endif + #if defined(HEDLEY_GNUC_HAS_CPP_ATTRIBUTE) #undef HEDLEY_GNUC_HAS_CPP_ATTRIBUTE #endif @@ -640,13 +788,99 @@ HEDLEY_GCC_VERSION_CHECK(major, minor, patch) #endif +/* HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_ is for + HEDLEY INTERNAL USE ONLY. API subject to change without notice. 
 */
+#if defined(HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_)
+#undef HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_
+#endif
+#if defined(__cplusplus)
+#if HEDLEY_HAS_WARNING("-Wc++98-compat")
+#if HEDLEY_HAS_WARNING("-Wc++17-extensions")
+#define HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(xpr) \
+	HEDLEY_DIAGNOSTIC_PUSH \
+	_Pragma("clang diagnostic ignored \"-Wc++98-compat\"") \
+	_Pragma("clang diagnostic ignored \"-Wc++17-extensions\"") \
+	xpr HEDLEY_DIAGNOSTIC_POP
+#else
+#define HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(xpr) \
+	HEDLEY_DIAGNOSTIC_PUSH \
+	_Pragma("clang diagnostic ignored \"-Wc++98-compat\"") \
+	xpr HEDLEY_DIAGNOSTIC_POP
+#endif
+#endif
+#endif
+#if !defined(HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_)
+#define HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(x) x
+#endif
+
+#if defined(HEDLEY_CONST_CAST)
+#undef HEDLEY_CONST_CAST
+#endif
+#if defined(__cplusplus)
+#define HEDLEY_CONST_CAST(T, expr) (const_cast<T>(expr))
+#elif HEDLEY_HAS_WARNING("-Wcast-qual") || \
+	HEDLEY_GCC_VERSION_CHECK(4, 6, 0) || \
+	HEDLEY_INTEL_VERSION_CHECK(13, 0, 0)
+#define HEDLEY_CONST_CAST(T, expr) \
+	(__extension__({ \
+		HEDLEY_DIAGNOSTIC_PUSH \
+		HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL((T)(expr)); \
+		HEDLEY_DIAGNOSTIC_POP \
+	}))
+#else
+#define HEDLEY_CONST_CAST(T, expr) ((T)(expr))
+#endif
+
+#if defined(HEDLEY_REINTERPRET_CAST)
+#undef HEDLEY_REINTERPRET_CAST
+#endif
+#if defined(__cplusplus)
+#define HEDLEY_REINTERPRET_CAST(T, expr) (reinterpret_cast<T>(expr))
+#else
+#define HEDLEY_REINTERPRET_CAST(T, expr) ((T)(expr))
+#endif
+
+#if defined(HEDLEY_STATIC_CAST)
+#undef HEDLEY_STATIC_CAST
+#endif
+#if defined(__cplusplus)
+#define HEDLEY_STATIC_CAST(T, expr) (static_cast<T>(expr))
+#else
+#define HEDLEY_STATIC_CAST(T, expr) ((T)(expr))
+#endif
+
+#if defined(HEDLEY_CPP_CAST)
+#undef HEDLEY_CPP_CAST
+#endif
+#if defined(__cplusplus)
+#if HEDLEY_HAS_WARNING("-Wold-style-cast")
+#define HEDLEY_CPP_CAST(T, expr) \
+	HEDLEY_DIAGNOSTIC_PUSH \
+	_Pragma("clang diagnostic ignored \"-Wold-style-cast\"")((T)(expr)) \
+	HEDLEY_DIAGNOSTIC_POP
+#elif HEDLEY_IAR_VERSION_CHECK(8, 3, 0)
+#define HEDLEY_CPP_CAST(T, expr) \
+	HEDLEY_DIAGNOSTIC_PUSH \
+	_Pragma("diag_suppress=Pe137") HEDLEY_DIAGNOSTIC_POP #else
+#define HEDLEY_CPP_CAST(T, expr) ((T)(expr))
+#endif
+#else
+#define HEDLEY_CPP_CAST(T, expr) (expr)
+#endif
+
 #if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) || \
 	defined(__clang__) || HEDLEY_GCC_VERSION_CHECK(3, 0, 0) || \
 	HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) || \
 	HEDLEY_IAR_VERSION_CHECK(8, 0, 0) || \
 	HEDLEY_PGI_VERSION_CHECK(18, 4, 0) || \
 	HEDLEY_ARM_VERSION_CHECK(4, 1, 0) || \
-	HEDLEY_TI_VERSION_CHECK(6, 0, 0) || \
+	HEDLEY_TI_VERSION_CHECK(15, 12, 0) || \
+	HEDLEY_TI_ARMCL_VERSION_CHECK(4, 7, 0) || \
+	HEDLEY_TI_CL430_VERSION_CHECK(2, 0, 1) || \
+	HEDLEY_TI_CL2000_VERSION_CHECK(6, 1, 0) || \
+	HEDLEY_TI_CL6X_VERSION_CHECK(7, 0, 0) || \
+	HEDLEY_TI_CL7X_VERSION_CHECK(1, 2, 0) || \
+	HEDLEY_TI_CLPRU_VERSION_CHECK(2, 1, 0) || \
 	HEDLEY_CRAY_VERSION_CHECK(5, 0, 0) || \
 	HEDLEY_TINYC_VERSION_CHECK(0, 9, 17) || \
 	HEDLEY_SUNPRO_VERSION_CHECK(8, 0, 0) || \
@@ -679,7 +913,12 @@
 #elif HEDLEY_ARM_VERSION_CHECK(5, 6, 0)
 #define HEDLEY_DIAGNOSTIC_PUSH _Pragma("push")
 #define HEDLEY_DIAGNOSTIC_POP _Pragma("pop")
-#elif HEDLEY_TI_VERSION_CHECK(8, 1, 0)
+#elif HEDLEY_TI_VERSION_CHECK(15, 12, 0) || \
+	HEDLEY_TI_ARMCL_VERSION_CHECK(5, 2, 0) || \
+	HEDLEY_TI_CL430_VERSION_CHECK(4, 4, 0) || \
+	HEDLEY_TI_CL6X_VERSION_CHECK(8, 1, 0) || \
+	HEDLEY_TI_CL7X_VERSION_CHECK(1, 2, 0) || \
+	
HEDLEY_TI_CLPRU_VERSION_CHECK(2, 1, 0) #define HEDLEY_DIAGNOSTIC_PUSH _Pragma("diag_push") #define HEDLEY_DIAGNOSTIC_POP _Pragma("diag_pop") #elif HEDLEY_PELLES_VERSION_CHECK(2, 90, 0) @@ -706,7 +945,21 @@ _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"") #elif HEDLEY_MSVC_VERSION_CHECK(15, 0, 0) #define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED __pragma(warning(disable : 4996)) -#elif HEDLEY_TI_VERSION_CHECK(8, 0, 0) +#elif HEDLEY_TI_VERSION_CHECK(15, 12, 0) || \ + (HEDLEY_TI_ARMCL_VERSION_CHECK(4, 8, 0) && \ + defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_ARMCL_VERSION_CHECK(5, 2, 0) || \ + (HEDLEY_TI_CL2000_VERSION_CHECK(6, 0, 0) && \ + defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL2000_VERSION_CHECK(6, 4, 0) || \ + (HEDLEY_TI_CL430_VERSION_CHECK(4, 0, 0) && \ + defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL430_VERSION_CHECK(4, 3, 0) || \ + (HEDLEY_TI_CL6X_VERSION_CHECK(7, 2, 0) && \ + defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(7, 5, 0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1, 2, 0) || \ + HEDLEY_TI_CLPRU_VERSION_CHECK(2, 1, 0) #define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress 1291,1718") #elif HEDLEY_SUNPRO_VERSION_CHECK(5, 13, 0) && !defined(__cplusplus) #define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED \ @@ -740,7 +993,12 @@ #elif HEDLEY_MSVC_VERSION_CHECK(15, 0, 0) #define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS \ __pragma(warning(disable : 4068)) -#elif HEDLEY_TI_VERSION_CHECK(8, 0, 0) +#elif HEDLEY_TI_VERSION_CHECK(16, 9, 0) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(8, 0, 0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1, 2, 0) || \ + HEDLEY_TI_CLPRU_VERSION_CHECK(2, 3, 0) +#define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress 163") +#elif HEDLEY_TI_CL6X_VERSION_CHECK(8, 0, 0) #define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress 163") #elif HEDLEY_IAR_VERSION_CHECK(8, 0, 0) #define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress=Pe161") @@ -748,6 +1006,39 @@ #define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS #endif +#if defined(HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES) +#undef HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES +#endif +#if HEDLEY_HAS_WARNING("-Wunknown-attributes") +#define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES \ + _Pragma("clang diagnostic ignored \"-Wunknown-attributes\"") +#elif HEDLEY_GCC_VERSION_CHECK(4, 6, 0) +#define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES \ + _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"") +#elif HEDLEY_INTEL_VERSION_CHECK(17, 0, 0) +#define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES \ + _Pragma("warning(disable:1292)") +#elif HEDLEY_MSVC_VERSION_CHECK(19, 0, 0) +#define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES \ + __pragma(warning(disable : 5030)) +#elif HEDLEY_PGI_VERSION_CHECK(17, 10, 0) +#define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES \ + _Pragma("diag_suppress 1097") +#elif HEDLEY_SUNPRO_VERSION_CHECK(5, 14, 0) && defined(__cplusplus) +#define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES \ + _Pragma("error_messages(off,attrskipunsup)") +#elif HEDLEY_TI_VERSION_CHECK(18, 1, 0) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(8, 3, 0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1, 2, 0) +#define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES \ + _Pragma("diag_suppress 1173") +#elif HEDLEY_IAR_VERSION_CHECK(8, 0, 0) +#define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES \ + _Pragma("diag_suppress=Pe1097") +#else +#define 
HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES +#endif + #if defined(HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL) #undef HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL #endif @@ -771,25 +1062,44 @@ #undef HEDLEY_DEPRECATED_FOR #endif #if defined(__cplusplus) && (__cplusplus >= 201402L) -#define HEDLEY_DEPRECATED(since) [[deprecated("Since " #since)]] -#define HEDLEY_DEPRECATED_FOR(since, replacement) \ - [[deprecated("Since " #since "; use " #replacement)]] +#define HEDLEY_DEPRECATED(since) \ + HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_( \ + [[deprecated("Since " #since)]]) +#define HEDLEY_DEPRECATED_FOR(since, replacement) \ + HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_( \ + [[deprecated("Since " #since "; use " #replacement)]]) #elif HEDLEY_HAS_EXTENSION(attribute_deprecated_with_message) || \ HEDLEY_GCC_VERSION_CHECK(4, 5, 0) || \ HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) || \ HEDLEY_ARM_VERSION_CHECK(5, 6, 0) || \ HEDLEY_SUNPRO_VERSION_CHECK(5, 13, 0) || \ HEDLEY_PGI_VERSION_CHECK(17, 10, 0) || \ - HEDLEY_TI_VERSION_CHECK(8, 3, 0) + HEDLEY_TI_VERSION_CHECK(18, 1, 0) || \ + HEDLEY_TI_ARMCL_VERSION_CHECK(18, 1, 0) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(8, 3, 0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1, 2, 0) || \ + HEDLEY_TI_CLPRU_VERSION_CHECK(2, 3, 0) #define HEDLEY_DEPRECATED(since) \ __attribute__((__deprecated__("Since " #since))) #define HEDLEY_DEPRECATED_FOR(since, replacement) \ __attribute__((__deprecated__("Since " #since "; use " #replacement))) #elif HEDLEY_HAS_ATTRIBUTE(deprecated) || HEDLEY_GCC_VERSION_CHECK(3, 1, 0) || \ HEDLEY_ARM_VERSION_CHECK(4, 1, 0) || \ - HEDLEY_TI_VERSION_CHECK(8, 0, 0) || \ - (HEDLEY_TI_VERSION_CHECK(7, 3, 0) && \ - defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) + HEDLEY_TI_VERSION_CHECK(15, 12, 0) || \ + (HEDLEY_TI_ARMCL_VERSION_CHECK(4, 8, 0) && \ + defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_ARMCL_VERSION_CHECK(5, 2, 0) || \ + (HEDLEY_TI_CL2000_VERSION_CHECK(6, 0, 0) && \ + defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL2000_VERSION_CHECK(6, 4, 0) || \ + (HEDLEY_TI_CL430_VERSION_CHECK(4, 0, 0) && \ + defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL430_VERSION_CHECK(4, 3, 0) || \ + (HEDLEY_TI_CL6X_VERSION_CHECK(7, 2, 0) && \ + defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(7, 5, 0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1, 2, 0) || \ + HEDLEY_TI_CLPRU_VERSION_CHECK(2, 1, 0) #define HEDLEY_DEPRECATED(since) __attribute__((__deprecated__)) #define HEDLEY_DEPRECATED_FOR(since, replacement) \ __attribute__((__deprecated__)) @@ -823,21 +1133,48 @@ #if defined(HEDLEY_WARN_UNUSED_RESULT) #undef HEDLEY_WARN_UNUSED_RESULT #endif -#if defined(__cplusplus) && (__cplusplus >= 201703L) -#define HEDLEY_WARN_UNUSED_RESULT [[nodiscard]] +#if defined(HEDLEY_WARN_UNUSED_RESULT_MSG) +#undef HEDLEY_WARN_UNUSED_RESULT_MSG +#endif +#if (HEDLEY_HAS_CPP_ATTRIBUTE(nodiscard) >= 201907L) +#define HEDLEY_WARN_UNUSED_RESULT \ + HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard]]) +#define HEDLEY_WARN_UNUSED_RESULT_MSG(msg) \ + HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard(msg)]]) +#elif HEDLEY_HAS_CPP_ATTRIBUTE(nodiscard) +#define HEDLEY_WARN_UNUSED_RESULT \ + HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard]]) +#define HEDLEY_WARN_UNUSED_RESULT_MSG(msg) \ + HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard]]) #elif HEDLEY_HAS_ATTRIBUTE(warn_unused_result) || \ HEDLEY_GCC_VERSION_CHECK(3, 4, 0) || \ HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) || \ - HEDLEY_TI_VERSION_CHECK(8, 0, 0) || \ - (HEDLEY_TI_VERSION_CHECK(7, 3, 0) 
&& \ + HEDLEY_TI_VERSION_CHECK(15, 12, 0) || \ + (HEDLEY_TI_ARMCL_VERSION_CHECK(4, 8, 0) && \ + defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_ARMCL_VERSION_CHECK(5, 2, 0) || \ + (HEDLEY_TI_CL2000_VERSION_CHECK(6, 0, 0) && \ + defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL2000_VERSION_CHECK(6, 4, 0) || \ + (HEDLEY_TI_CL430_VERSION_CHECK(4, 0, 0) && \ + defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL430_VERSION_CHECK(4, 3, 0) || \ + (HEDLEY_TI_CL6X_VERSION_CHECK(7, 2, 0) && \ defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(7, 5, 0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1, 2, 0) || \ + HEDLEY_TI_CLPRU_VERSION_CHECK(2, 1, 0) || \ (HEDLEY_SUNPRO_VERSION_CHECK(5, 15, 0) && defined(__cplusplus)) || \ HEDLEY_PGI_VERSION_CHECK(17, 10, 0) #define HEDLEY_WARN_UNUSED_RESULT __attribute__((__warn_unused_result__)) +#define HEDLEY_WARN_UNUSED_RESULT_MSG(msg) \ + __attribute__((__warn_unused_result__)) #elif defined(_Check_return_) /* SAL */ #define HEDLEY_WARN_UNUSED_RESULT _Check_return_ +#define HEDLEY_WARN_UNUSED_RESULT_MSG(msg) _Check_return_ #else #define HEDLEY_WARN_UNUSED_RESULT +#define HEDLEY_WARN_UNUSED_RESULT_MSG(msg) #endif #if defined(HEDLEY_SENTINEL) @@ -861,20 +1198,33 @@ #elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L #define HEDLEY_NO_RETURN _Noreturn #elif defined(__cplusplus) && (__cplusplus >= 201103L) -#define HEDLEY_NO_RETURN [[noreturn]] +#define HEDLEY_NO_RETURN \ + HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[noreturn]]) #elif HEDLEY_HAS_ATTRIBUTE(noreturn) || HEDLEY_GCC_VERSION_CHECK(3, 2, 0) || \ HEDLEY_SUNPRO_VERSION_CHECK(5, 11, 0) || \ HEDLEY_ARM_VERSION_CHECK(4, 1, 0) || \ HEDLEY_IBM_VERSION_CHECK(10, 1, 0) || \ - HEDLEY_TI_VERSION_CHECK(18, 0, 0) || \ - (HEDLEY_TI_VERSION_CHECK(17, 3, 0) && \ - defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) + HEDLEY_TI_VERSION_CHECK(15, 12, 0) || \ + (HEDLEY_TI_ARMCL_VERSION_CHECK(4, 8, 0) && \ + defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_ARMCL_VERSION_CHECK(5, 2, 0) || \ + (HEDLEY_TI_CL2000_VERSION_CHECK(6, 0, 0) && \ + defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL2000_VERSION_CHECK(6, 4, 0) || \ + (HEDLEY_TI_CL430_VERSION_CHECK(4, 0, 0) && \ + defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL430_VERSION_CHECK(4, 3, 0) || \ + (HEDLEY_TI_CL6X_VERSION_CHECK(7, 2, 0) && \ + defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(7, 5, 0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1, 2, 0) || \ + HEDLEY_TI_CLPRU_VERSION_CHECK(2, 1, 0) #define HEDLEY_NO_RETURN __attribute__((__noreturn__)) #elif HEDLEY_SUNPRO_VERSION_CHECK(5, 10, 0) #define HEDLEY_NO_RETURN _Pragma("does_not_return") #elif HEDLEY_MSVC_VERSION_CHECK(13, 10, 0) #define HEDLEY_NO_RETURN __declspec(noreturn) -#elif HEDLEY_TI_VERSION_CHECK(6, 0, 0) && defined(__cplusplus) +#elif HEDLEY_TI_CL6X_VERSION_CHECK(6, 0, 0) && defined(__cplusplus) #define HEDLEY_NO_RETURN _Pragma("FUNC_NEVER_RETURNS;") #elif HEDLEY_COMPCERT_VERSION_CHECK(3, 2, 0) #define HEDLEY_NO_RETURN __attribute((noreturn)) @@ -884,37 +1234,21 @@ #define HEDLEY_NO_RETURN #endif +#if defined(HEDLEY_NO_ESCAPE) +#undef HEDLEY_NO_ESCAPE +#endif +#if HEDLEY_HAS_ATTRIBUTE(noescape) +#define HEDLEY_NO_ESCAPE __attribute__((__noescape__)) +#else +#define HEDLEY_NO_ESCAPE +#endif + #if defined(HEDLEY_UNREACHABLE) #undef HEDLEY_UNREACHABLE #endif #if defined(HEDLEY_UNREACHABLE_RETURN) #undef HEDLEY_UNREACHABLE_RETURN #endif -#if (HEDLEY_HAS_BUILTIN(__builtin_unreachable) && \ - (!defined(HEDLEY_ARM_VERSION))) || \ - 
HEDLEY_GCC_VERSION_CHECK(4, 5, 0) || \ - HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) || \ - HEDLEY_IBM_VERSION_CHECK(13, 1, 5) -#define HEDLEY_UNREACHABLE() __builtin_unreachable() -#elif HEDLEY_MSVC_VERSION_CHECK(13, 10, 0) -#define HEDLEY_UNREACHABLE() __assume(0) -#elif HEDLEY_TI_VERSION_CHECK(6, 0, 0) -#if defined(__cplusplus) -#define HEDLEY_UNREACHABLE() std::_nassert(0) -#else -#define HEDLEY_UNREACHABLE() _nassert(0) -#endif -#define HEDLEY_UNREACHABLE_RETURN(value) return value -#elif defined(EXIT_FAILURE) -#define HEDLEY_UNREACHABLE() abort() -#else -#define HEDLEY_UNREACHABLE() -#define HEDLEY_UNREACHABLE_RETURN(value) return value -#endif -#if !defined(HEDLEY_UNREACHABLE_RETURN) -#define HEDLEY_UNREACHABLE_RETURN(value) HEDLEY_UNREACHABLE() -#endif - #if defined(HEDLEY_ASSUME) #undef HEDLEY_ASSUME #endif @@ -922,24 +1256,55 @@ #define HEDLEY_ASSUME(expr) __assume(expr) #elif HEDLEY_HAS_BUILTIN(__builtin_assume) #define HEDLEY_ASSUME(expr) __builtin_assume(expr) -#elif HEDLEY_TI_VERSION_CHECK(6, 0, 0) +#elif HEDLEY_TI_CL2000_VERSION_CHECK(6, 2, 0) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(4, 0, 0) #if defined(__cplusplus) #define HEDLEY_ASSUME(expr) std::_nassert(expr) #else #define HEDLEY_ASSUME(expr) _nassert(expr) #endif -#elif (HEDLEY_HAS_BUILTIN(__builtin_unreachable) && \ - !defined(HEDLEY_ARM_VERSION)) || \ - HEDLEY_GCC_VERSION_CHECK(4, 5, 0) || \ - HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) || \ +#endif +#if (HEDLEY_HAS_BUILTIN(__builtin_unreachable) && \ + (!defined(HEDLEY_ARM_VERSION))) || \ + HEDLEY_GCC_VERSION_CHECK(4, 5, 0) || \ + HEDLEY_PGI_VERSION_CHECK(18, 10, 0) || \ + HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) || \ HEDLEY_IBM_VERSION_CHECK(13, 1, 5) -#define HEDLEY_ASSUME(expr) ((void)((expr) ? 1 : (__builtin_unreachable(), 1))) +#define HEDLEY_UNREACHABLE() __builtin_unreachable() +#elif defined(HEDLEY_ASSUME) +#define HEDLEY_UNREACHABLE() HEDLEY_ASSUME(0) +#endif +#if !defined(HEDLEY_ASSUME) +#if defined(HEDLEY_UNREACHABLE) +#define HEDLEY_ASSUME(expr) \ + HEDLEY_STATIC_CAST(void, ((expr) ? 
1 : (HEDLEY_UNREACHABLE(), 1))) #else -#define HEDLEY_ASSUME(expr) ((void)(expr)) +#define HEDLEY_ASSUME(expr) HEDLEY_STATIC_CAST(void, expr) +#endif +#endif +#if defined(HEDLEY_UNREACHABLE) +#if HEDLEY_TI_CL2000_VERSION_CHECK(6, 2, 0) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(4, 0, 0) +#define HEDLEY_UNREACHABLE_RETURN(value) \ + return (HEDLEY_STATIC_CAST(void, HEDLEY_ASSUME(0)), (value)) +#else +#define HEDLEY_UNREACHABLE_RETURN(value) HEDLEY_UNREACHABLE() +#endif +#else +#define HEDLEY_UNREACHABLE_RETURN(value) return (value) +#endif +#if !defined(HEDLEY_UNREACHABLE) +#define HEDLEY_UNREACHABLE() HEDLEY_ASSUME(0) #endif HEDLEY_DIAGNOSTIC_PUSH -#if HEDLEY_HAS_WARNING("-Wvariadic-macros") || HEDLEY_GCC_VERSION_CHECK(4, 0, 0) +#if HEDLEY_HAS_WARNING("-Wpedantic") +#pragma clang diagnostic ignored "-Wpedantic" +#endif +#if HEDLEY_HAS_WARNING("-Wc++98-compat-pedantic") && defined(__cplusplus) +#pragma clang diagnostic ignored "-Wc++98-compat-pedantic" +#endif +#if HEDLEY_GCC_HAS_WARNING("-Wvariadic-macros", 4, 0, 0) #if defined(__clang__) #pragma clang diagnostic ignored "-Wvariadic-macros" #elif defined(HEDLEY_GCC_VERSION) @@ -973,9 +1338,21 @@ HEDLEY_DIAGNOSTIC_POP HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) || \ HEDLEY_ARM_VERSION_CHECK(5, 6, 0) || \ HEDLEY_IBM_VERSION_CHECK(10, 1, 0) || \ - HEDLEY_TI_VERSION_CHECK(8, 0, 0) || \ - (HEDLEY_TI_VERSION_CHECK(7, 3, 0) && \ - defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) + HEDLEY_TI_VERSION_CHECK(15, 12, 0) || \ + (HEDLEY_TI_ARMCL_VERSION_CHECK(4, 8, 0) && \ + defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_ARMCL_VERSION_CHECK(5, 2, 0) || \ + (HEDLEY_TI_CL2000_VERSION_CHECK(6, 0, 0) && \ + defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL2000_VERSION_CHECK(6, 4, 0) || \ + (HEDLEY_TI_CL430_VERSION_CHECK(4, 0, 0) && \ + defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL430_VERSION_CHECK(4, 3, 0) || \ + (HEDLEY_TI_CL6X_VERSION_CHECK(7, 2, 0) && \ + defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(7, 5, 0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1, 2, 0) || \ + HEDLEY_TI_CLPRU_VERSION_CHECK(2, 1, 0) #define HEDLEY_PRINTF_FORMAT(string_idx, first_to_check) \ __attribute__((__format__(__printf__, string_idx, first_to_check))) #elif HEDLEY_PELLES_VERSION_CHECK(6, 0, 0) @@ -990,7 +1367,7 @@ HEDLEY_DIAGNOSTIC_POP #endif #if defined(__cplusplus) #if __cplusplus >= 201103L -#define HEDLEY_CONSTEXPR constexpr +#define HEDLEY_CONSTEXPR HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(constexpr) #endif #endif #if !defined(HEDLEY_CONSTEXPR) @@ -1010,56 +1387,60 @@ HEDLEY_DIAGNOSTIC_POP #undef HEDLEY_UNPREDICTABLE #endif #if HEDLEY_HAS_BUILTIN(__builtin_unpredictable) -#define HEDLEY_UNPREDICTABLE(expr) __builtin_unpredictable(!!(expr)) +#define HEDLEY_UNPREDICTABLE(expr) __builtin_unpredictable((expr)) #endif #if HEDLEY_HAS_BUILTIN(__builtin_expect_with_probability) || \ HEDLEY_GCC_VERSION_CHECK(9, 0, 0) #define HEDLEY_PREDICT(expr, value, probability) \ - __builtin_expect_with_probability(expr, value, probability) + __builtin_expect_with_probability((expr), (value), (probability)) #define HEDLEY_PREDICT_TRUE(expr, probability) \ - __builtin_expect_with_probability(!!(expr), 1, probability) + __builtin_expect_with_probability(!!(expr), 1, (probability)) #define HEDLEY_PREDICT_FALSE(expr, probability) \ - __builtin_expect_with_probability(!!(expr), 0, probability) + __builtin_expect_with_probability(!!(expr), 0, (probability)) #define HEDLEY_LIKELY(expr) __builtin_expect(!!(expr), 1) #define HEDLEY_UNLIKELY(expr) 
__builtin_expect(!!(expr), 0) -#if !defined(HEDLEY_BUILTIN_UNPREDICTABLE) -#define HEDLEY_BUILTIN_UNPREDICTABLE(expr) \ - __builtin_expect_with_probability(!!(expr), 1, 0.5) -#endif #elif HEDLEY_HAS_BUILTIN(__builtin_expect) || \ HEDLEY_GCC_VERSION_CHECK(3, 0, 0) || \ HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) || \ (HEDLEY_SUNPRO_VERSION_CHECK(5, 15, 0) && defined(__cplusplus)) || \ HEDLEY_ARM_VERSION_CHECK(4, 1, 0) || \ HEDLEY_IBM_VERSION_CHECK(10, 1, 0) || \ - HEDLEY_TI_VERSION_CHECK(6, 1, 0) || \ - HEDLEY_TINYC_VERSION_CHECK(0, 9, 27) -#define HEDLEY_PREDICT(expr, expected, probability) \ - (((probability) >= 0.9) ? __builtin_expect(!!(expr), (expected)) \ - : (((void)(expected)), !!(expr))) -#define HEDLEY_PREDICT_TRUE(expr, probability) \ - (__extension__({ \ - HEDLEY_CONSTEXPR double hedley_probability_ = (probability); \ - ((hedley_probability_ >= 0.9) \ - ? __builtin_expect(!!(expr), 1) \ - : ((hedley_probability_ <= 0.1) \ - ? __builtin_expect(!!(expr), 0) \ - : !!(expr))); \ + HEDLEY_TI_VERSION_CHECK(15, 12, 0) || \ + HEDLEY_TI_ARMCL_VERSION_CHECK(4, 7, 0) || \ + HEDLEY_TI_CL430_VERSION_CHECK(3, 1, 0) || \ + HEDLEY_TI_CL2000_VERSION_CHECK(6, 1, 0) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(6, 1, 0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1, 2, 0) || \ + HEDLEY_TI_CLPRU_VERSION_CHECK(2, 1, 0) || \ + HEDLEY_TINYC_VERSION_CHECK(0, 9, 27) || \ + HEDLEY_CRAY_VERSION_CHECK(8, 1, 0) +#define HEDLEY_PREDICT(expr, expected, probability) \ + (((probability) >= 0.9) \ + ? __builtin_expect((expr), (expected)) \ + : (HEDLEY_STATIC_CAST(void, expected), (expr))) +#define HEDLEY_PREDICT_TRUE(expr, probability) \ + (__extension__({ \ + double hedley_probability_ = (probability); \ + ((hedley_probability_ >= 0.9) \ + ? __builtin_expect(!!(expr), 1) \ + : ((hedley_probability_ <= 0.1) \ + ? __builtin_expect(!!(expr), 0) \ + : !!(expr))); \ })) -#define HEDLEY_PREDICT_FALSE(expr, probability) \ - (__extension__({ \ - HEDLEY_CONSTEXPR double hedley_probability_ = (probability); \ - ((hedley_probability_ >= 0.9) \ - ? __builtin_expect(!!(expr), 0) \ - : ((hedley_probability_ <= 0.1) \ - ? __builtin_expect(!!(expr), 1) \ - : !!(expr))); \ +#define HEDLEY_PREDICT_FALSE(expr, probability) \ + (__extension__({ \ + double hedley_probability_ = (probability); \ + ((hedley_probability_ >= 0.9) \ + ? __builtin_expect(!!(expr), 0) \ + : ((hedley_probability_ <= 0.1) \ + ? 
__builtin_expect(!!(expr), 1) \ + : !!(expr))); \ })) #define HEDLEY_LIKELY(expr) __builtin_expect(!!(expr), 1) #define HEDLEY_UNLIKELY(expr) __builtin_expect(!!(expr), 0) #else #define HEDLEY_PREDICT(expr, expected, probability) \ - (((void)(expected)), !!(expr)) + (HEDLEY_STATIC_CAST(void, expected), (expr)) #define HEDLEY_PREDICT_TRUE(expr, probability) (!!(expr)) #define HEDLEY_PREDICT_FALSE(expr, probability) (!!(expr)) #define HEDLEY_LIKELY(expr) (!!(expr)) @@ -1077,9 +1458,21 @@ HEDLEY_DIAGNOSTIC_POP HEDLEY_SUNPRO_VERSION_CHECK(5, 11, 0) || \ HEDLEY_ARM_VERSION_CHECK(4, 1, 0) || \ HEDLEY_IBM_VERSION_CHECK(12, 1, 0) || \ - HEDLEY_TI_VERSION_CHECK(8, 0, 0) || \ - (HEDLEY_TI_VERSION_CHECK(7, 3, 0) && \ - defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) + HEDLEY_TI_VERSION_CHECK(15, 12, 0) || \ + (HEDLEY_TI_ARMCL_VERSION_CHECK(4, 8, 0) && \ + defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_ARMCL_VERSION_CHECK(5, 2, 0) || \ + (HEDLEY_TI_CL2000_VERSION_CHECK(6, 0, 0) && \ + defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL2000_VERSION_CHECK(6, 4, 0) || \ + (HEDLEY_TI_CL430_VERSION_CHECK(4, 0, 0) && \ + defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL430_VERSION_CHECK(4, 3, 0) || \ + (HEDLEY_TI_CL6X_VERSION_CHECK(7, 2, 0) && \ + defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(7, 5, 0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1, 2, 0) || \ + HEDLEY_TI_CLPRU_VERSION_CHECK(2, 1, 0) #define HEDLEY_MALLOC __attribute__((__malloc__)) #elif HEDLEY_SUNPRO_VERSION_CHECK(5, 10, 0) #define HEDLEY_MALLOC _Pragma("returns_new_memory") @@ -1097,14 +1490,28 @@ HEDLEY_DIAGNOSTIC_POP HEDLEY_SUNPRO_VERSION_CHECK(5, 11, 0) || \ HEDLEY_ARM_VERSION_CHECK(4, 1, 0) || \ HEDLEY_IBM_VERSION_CHECK(10, 1, 0) || \ - HEDLEY_TI_VERSION_CHECK(8, 0, 0) || \ - (HEDLEY_TI_VERSION_CHECK(7, 3, 0) && \ + HEDLEY_TI_VERSION_CHECK(15, 12, 0) || \ + (HEDLEY_TI_ARMCL_VERSION_CHECK(4, 8, 0) && \ + defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_ARMCL_VERSION_CHECK(5, 2, 0) || \ + (HEDLEY_TI_CL2000_VERSION_CHECK(6, 0, 0) && \ + defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL2000_VERSION_CHECK(6, 4, 0) || \ + (HEDLEY_TI_CL430_VERSION_CHECK(4, 0, 0) && \ + defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL430_VERSION_CHECK(4, 3, 0) || \ + (HEDLEY_TI_CL6X_VERSION_CHECK(7, 2, 0) && \ defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(7, 5, 0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1, 2, 0) || \ + HEDLEY_TI_CLPRU_VERSION_CHECK(2, 1, 0) || \ HEDLEY_PGI_VERSION_CHECK(17, 10, 0) #define HEDLEY_PURE __attribute__((__pure__)) #elif HEDLEY_SUNPRO_VERSION_CHECK(5, 10, 0) #define HEDLEY_PURE _Pragma("does_not_write_global_data") -#elif HEDLEY_TI_VERSION_CHECK(6, 0, 0) && defined(__cplusplus) +#elif defined(__cplusplus) && (HEDLEY_TI_CL430_VERSION_CHECK(2, 0, 1) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(4, 0, 0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1, 2, 0)) #define HEDLEY_PURE _Pragma("FUNC_IS_PURE;") #else #define HEDLEY_PURE @@ -1118,9 +1525,21 @@ HEDLEY_DIAGNOSTIC_POP HEDLEY_SUNPRO_VERSION_CHECK(5, 11, 0) || \ HEDLEY_ARM_VERSION_CHECK(4, 1, 0) || \ HEDLEY_IBM_VERSION_CHECK(10, 1, 0) || \ - HEDLEY_TI_VERSION_CHECK(8, 0, 0) || \ - (HEDLEY_TI_VERSION_CHECK(7, 3, 0) && \ + HEDLEY_TI_VERSION_CHECK(15, 12, 0) || \ + (HEDLEY_TI_ARMCL_VERSION_CHECK(4, 8, 0) && \ defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_ARMCL_VERSION_CHECK(5, 2, 0) || \ + (HEDLEY_TI_CL2000_VERSION_CHECK(6, 0, 0) && \ + defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + 
HEDLEY_TI_CL2000_VERSION_CHECK(6, 4, 0) || \ + (HEDLEY_TI_CL430_VERSION_CHECK(4, 0, 0) && \ + defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL430_VERSION_CHECK(4, 3, 0) || \ + (HEDLEY_TI_CL6X_VERSION_CHECK(7, 2, 0) && \ + defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(7, 5, 0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1, 2, 0) || \ + HEDLEY_TI_CLPRU_VERSION_CHECK(2, 1, 0) || \ HEDLEY_PGI_VERSION_CHECK(17, 10, 0) #define HEDLEY_CONST __attribute__((__const__)) #elif HEDLEY_SUNPRO_VERSION_CHECK(5, 10, 0) @@ -1141,7 +1560,10 @@ HEDLEY_DIAGNOSTIC_POP HEDLEY_ARM_VERSION_CHECK(4, 1, 0) || \ HEDLEY_IBM_VERSION_CHECK(10, 1, 0) || \ HEDLEY_PGI_VERSION_CHECK(17, 10, 0) || \ - HEDLEY_TI_VERSION_CHECK(8, 0, 0) || \ + HEDLEY_TI_CL430_VERSION_CHECK(4, 3, 0) || \ + HEDLEY_TI_CL2000_VERSION_CHECK(6, 2, 4) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(8, 1, 0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1, 2, 0) || \ (HEDLEY_SUNPRO_VERSION_CHECK(5, 14, 0) && defined(__cplusplus)) || \ HEDLEY_IAR_VERSION_CHECK(8, 0, 0) || defined(__clang__) #define HEDLEY_RESTRICT __restrict @@ -1159,8 +1581,14 @@ HEDLEY_DIAGNOSTIC_POP #define HEDLEY_INLINE inline #elif defined(HEDLEY_GCC_VERSION) || HEDLEY_ARM_VERSION_CHECK(6, 2, 0) #define HEDLEY_INLINE __inline__ -#elif HEDLEY_MSVC_VERSION_CHECK(12, 0, 0) || \ - HEDLEY_ARM_VERSION_CHECK(4, 1, 0) || HEDLEY_TI_VERSION_CHECK(8, 0, 0) +#elif HEDLEY_MSVC_VERSION_CHECK(12, 0, 0) || \ + HEDLEY_ARM_VERSION_CHECK(4, 1, 0) || \ + HEDLEY_TI_ARMCL_VERSION_CHECK(5, 1, 0) || \ + HEDLEY_TI_CL430_VERSION_CHECK(3, 1, 0) || \ + HEDLEY_TI_CL2000_VERSION_CHECK(6, 2, 0) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(8, 0, 0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1, 2, 0) || \ + HEDLEY_TI_CLPRU_VERSION_CHECK(2, 1, 0) #define HEDLEY_INLINE __inline #else #define HEDLEY_INLINE @@ -1169,19 +1597,36 @@ HEDLEY_DIAGNOSTIC_POP #if defined(HEDLEY_ALWAYS_INLINE) #undef HEDLEY_ALWAYS_INLINE #endif -#if HEDLEY_HAS_ATTRIBUTE(always_inline) || \ - HEDLEY_GCC_VERSION_CHECK(4, 0, 0) || \ - HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) || \ - HEDLEY_SUNPRO_VERSION_CHECK(5, 11, 0) || \ - HEDLEY_ARM_VERSION_CHECK(4, 1, 0) || \ - HEDLEY_IBM_VERSION_CHECK(10, 1, 0) || \ - HEDLEY_TI_VERSION_CHECK(8, 0, 0) || \ - (HEDLEY_TI_VERSION_CHECK(7, 3, 0) && \ - defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) +#if HEDLEY_HAS_ATTRIBUTE(always_inline) || \ + HEDLEY_GCC_VERSION_CHECK(4, 0, 0) || \ + HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) || \ + HEDLEY_SUNPRO_VERSION_CHECK(5, 11, 0) || \ + HEDLEY_ARM_VERSION_CHECK(4, 1, 0) || \ + HEDLEY_IBM_VERSION_CHECK(10, 1, 0) || \ + HEDLEY_TI_VERSION_CHECK(15, 12, 0) || \ + (HEDLEY_TI_ARMCL_VERSION_CHECK(4, 8, 0) && \ + defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_ARMCL_VERSION_CHECK(5, 2, 0) || \ + (HEDLEY_TI_CL2000_VERSION_CHECK(6, 0, 0) && \ + defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL2000_VERSION_CHECK(6, 4, 0) || \ + (HEDLEY_TI_CL430_VERSION_CHECK(4, 0, 0) && \ + defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL430_VERSION_CHECK(4, 3, 0) || \ + (HEDLEY_TI_CL6X_VERSION_CHECK(7, 2, 0) && \ + defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(7, 5, 0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1, 2, 0) || \ + HEDLEY_TI_CLPRU_VERSION_CHECK(2, 1, 0) #define HEDLEY_ALWAYS_INLINE __attribute__((__always_inline__)) HEDLEY_INLINE #elif HEDLEY_MSVC_VERSION_CHECK(12, 0, 0) #define HEDLEY_ALWAYS_INLINE __forceinline -#elif HEDLEY_TI_VERSION_CHECK(7, 0, 0) && defined(__cplusplus) +#elif defined(__cplusplus) && (HEDLEY_TI_ARMCL_VERSION_CHECK(5, 2, 0) || \ + 
HEDLEY_TI_CL430_VERSION_CHECK(4, 3, 0) || \ + HEDLEY_TI_CL2000_VERSION_CHECK(6, 4, 0) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(6, 1, 0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1, 2, 0) || \ + HEDLEY_TI_CLPRU_VERSION_CHECK(2, 1, 0)) #define HEDLEY_ALWAYS_INLINE _Pragma("FUNC_ALWAYS_INLINE;") #elif HEDLEY_IAR_VERSION_CHECK(8, 0, 0) #define HEDLEY_ALWAYS_INLINE _Pragma("inline=forced") @@ -1197,15 +1642,27 @@ HEDLEY_DIAGNOSTIC_POP HEDLEY_SUNPRO_VERSION_CHECK(5, 11, 0) || \ HEDLEY_ARM_VERSION_CHECK(4, 1, 0) || \ HEDLEY_IBM_VERSION_CHECK(10, 1, 0) || \ - HEDLEY_TI_VERSION_CHECK(8, 0, 0) || \ - (HEDLEY_TI_VERSION_CHECK(7, 3, 0) && \ - defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) + HEDLEY_TI_VERSION_CHECK(15, 12, 0) || \ + (HEDLEY_TI_ARMCL_VERSION_CHECK(4, 8, 0) && \ + defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_ARMCL_VERSION_CHECK(5, 2, 0) || \ + (HEDLEY_TI_CL2000_VERSION_CHECK(6, 0, 0) && \ + defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL2000_VERSION_CHECK(6, 4, 0) || \ + (HEDLEY_TI_CL430_VERSION_CHECK(4, 0, 0) && \ + defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL430_VERSION_CHECK(4, 3, 0) || \ + (HEDLEY_TI_CL6X_VERSION_CHECK(7, 2, 0) && \ + defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(7, 5, 0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1, 2, 0) || \ + HEDLEY_TI_CLPRU_VERSION_CHECK(2, 1, 0) #define HEDLEY_NEVER_INLINE __attribute__((__noinline__)) #elif HEDLEY_MSVC_VERSION_CHECK(13, 10, 0) #define HEDLEY_NEVER_INLINE __declspec(noinline) #elif HEDLEY_PGI_VERSION_CHECK(10, 2, 0) #define HEDLEY_NEVER_INLINE _Pragma("noinline") -#elif HEDLEY_TI_VERSION_CHECK(6, 0, 0) && defined(__cplusplus) +#elif HEDLEY_TI_CL6X_VERSION_CHECK(6, 0, 0) && defined(__cplusplus) #define HEDLEY_NEVER_INLINE _Pragma("FUNC_CANNOT_INLINE;") #elif HEDLEY_IAR_VERSION_CHECK(8, 0, 0) #define HEDLEY_NEVER_INLINE _Pragma("inline=never") @@ -1236,9 +1693,9 @@ HEDLEY_DIAGNOSTIC_POP HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) || \ HEDLEY_ARM_VERSION_CHECK(4, 1, 0) || \ HEDLEY_IBM_VERSION_CHECK(13, 1, 0) || \ - HEDLEY_TI_VERSION_CHECK(8, 0, 0) || \ - (HEDLEY_TI_VERSION_CHECK(7, 3, 0) && defined(__TI_EABI__) && \ - defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) + (defined(__TI_EABI__) && ((HEDLEY_TI_CL6X_VERSION_CHECK(7, 2, 0) && \ + defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(7, 5, 0))) #define HEDLEY_PRIVATE __attribute__((__visibility__("hidden"))) #define HEDLEY_PUBLIC __attribute__((__visibility__("default"))) #else @@ -1263,29 +1720,20 @@ HEDLEY_DIAGNOSTIC_POP #if defined(HEDLEY_FALL_THROUGH) #undef HEDLEY_FALL_THROUGH #endif -#if defined(__cplusplus) && \ - (!defined(HEDLEY_SUNPRO_VERSION) || \ - HEDLEY_SUNPRO_VERSION_CHECK(5, 15, 0)) && \ - !defined(HEDLEY_PGI_VERSION) -#if (__cplusplus >= 201703L) || \ - ((__cplusplus >= 201103L) && HEDLEY_HAS_CPP_ATTRIBUTE(fallthrough)) -#define HEDLEY_FALL_THROUGH [[fallthrough]] -#elif (__cplusplus >= 201103L) && HEDLEY_HAS_CPP_ATTRIBUTE(clang::fallthrough) -#define HEDLEY_FALL_THROUGH [[clang::fallthrough]] -#elif (__cplusplus >= 201103L) && HEDLEY_GCC_VERSION_CHECK(7, 0, 0) -#define HEDLEY_FALL_THROUGH [[gnu::fallthrough]] -#endif -#endif -#if !defined(HEDLEY_FALL_THROUGH) #if HEDLEY_GNUC_HAS_ATTRIBUTE(fallthrough, 7, 0, 0) && \ !defined(HEDLEY_PGI_VERSION) #define HEDLEY_FALL_THROUGH __attribute__((__fallthrough__)) +#elif HEDLEY_HAS_CPP_ATTRIBUTE_NS(clang, fallthrough) +#define HEDLEY_FALL_THROUGH \ + HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[clang::fallthrough]]) +#elif HEDLEY_HAS_CPP_ATTRIBUTE(fallthrough) +#define 
HEDLEY_FALL_THROUGH \ + HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[fallthrough]]) #elif defined(__fallthrough) /* SAL */ #define HEDLEY_FALL_THROUGH __fallthrough #else #define HEDLEY_FALL_THROUGH #endif -#endif #if defined(HEDLEY_RETURNS_NON_NULL) #undef HEDLEY_RETURNS_NON_NULL @@ -1315,19 +1763,18 @@ HEDLEY_DIAGNOSTIC_POP #if defined(HEDLEY_REQUIRE_CONSTEXPR) #undef HEDLEY_REQUIRE_CONSTEXPR #endif -/* Note the double-underscore. For internal use only; no API - * guarantees! */ -#if defined(HEDLEY__IS_CONSTEXPR) -#undef HEDLEY__IS_CONSTEXPR +/* HEDLEY_IS_CONSTEXPR_ is for + HEDLEY INTERNAL USE ONLY. API subject to change without notice. */ +#if defined(HEDLEY_IS_CONSTEXPR_) +#undef HEDLEY_IS_CONSTEXPR_ #endif - #if HEDLEY_HAS_BUILTIN(__builtin_constant_p) || \ HEDLEY_GCC_VERSION_CHECK(3, 4, 0) || \ HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) || \ HEDLEY_TINYC_VERSION_CHECK(0, 9, 19) || \ HEDLEY_ARM_VERSION_CHECK(4, 1, 0) || \ HEDLEY_IBM_VERSION_CHECK(13, 1, 0) || \ - HEDLEY_TI_VERSION_CHECK(6, 1, 0) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(6, 1, 0) || \ (HEDLEY_SUNPRO_VERSION_CHECK(5, 10, 0) && !defined(__cplusplus)) || \ HEDLEY_CRAY_VERSION_CHECK(8, 1, 0) #define HEDLEY_IS_CONSTANT(expr) __builtin_constant_p(expr) @@ -1341,49 +1788,53 @@ HEDLEY_DIAGNOSTIC_POP HEDLEY_ARM_VERSION_CHECK(5, 4, 0) || \ HEDLEY_TINYC_VERSION_CHECK(0, 9, 24) #if defined(__INTPTR_TYPE__) -#define HEDLEY__IS_CONSTEXPR(expr) \ +#define HEDLEY_IS_CONSTEXPR_(expr) \ __builtin_types_compatible_p( \ __typeof__((1 ? (void *)((__INTPTR_TYPE__)((expr)*0)) \ : (int *)0)), \ int *) #else #include -#define HEDLEY__IS_CONSTEXPR(expr) \ +#define HEDLEY_IS_CONSTEXPR_(expr) \ __builtin_types_compatible_p( \ __typeof__((1 ? (void *)((intptr_t)((expr)*0)) : (int *)0)), \ int *) #endif -#elif (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && \ - !defined(HEDLEY_SUNPRO_VERSION) && !defined(HEDLEY_PGI_VERSION)) || \ - HEDLEY_HAS_EXTENSION(c_generic_selections) || \ - HEDLEY_GCC_VERSION_CHECK(4, 9, 0) || \ - HEDLEY_INTEL_VERSION_CHECK(17, 0, 0) || \ - HEDLEY_IBM_VERSION_CHECK(12, 1, 0) || \ +#elif (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && \ + !defined(HEDLEY_SUNPRO_VERSION) && !defined(HEDLEY_PGI_VERSION) && \ + !defined(HEDLEY_IAR_VERSION)) || \ + HEDLEY_HAS_EXTENSION(c_generic_selections) || \ + HEDLEY_GCC_VERSION_CHECK(4, 9, 0) || \ + HEDLEY_INTEL_VERSION_CHECK(17, 0, 0) || \ + HEDLEY_IBM_VERSION_CHECK(12, 1, 0) || \ HEDLEY_ARM_VERSION_CHECK(5, 3, 0) #if defined(__INTPTR_TYPE__) -#define HEDLEY__IS_CONSTEXPR(expr) \ +#define HEDLEY_IS_CONSTEXPR_(expr) \ _Generic((1 ? (void *)((__INTPTR_TYPE__)((expr)*0)) : (int *)0), \ int * : 1, void * : 0) #else #include -#define HEDLEY__IS_CONSTEXPR(expr) \ +#define HEDLEY_IS_CONSTEXPR_(expr) \ _Generic((1 ? (void *)((intptr_t)*0) : (int *)0), int * : 1, void * : 0) #endif -#elif defined(HEDLEY_GCC_VERSION) || defined(HEDLEY_INTEL_VERSION) || \ - defined(HEDLEY_TINYC_VERSION) || defined(HEDLEY_TI_VERSION) || \ - defined(__clang__) -#define HEDLEY__IS_CONSTEXPR(expr) \ +#elif defined(HEDLEY_GCC_VERSION) || defined(HEDLEY_INTEL_VERSION) || \ + defined(HEDLEY_TINYC_VERSION) || defined(HEDLEY_TI_ARMCL_VERSION) || \ + HEDLEY_TI_CL430_VERSION_CHECK(18, 12, 0) || \ + defined(HEDLEY_TI_CL2000_VERSION) || \ + defined(HEDLEY_TI_CL6X_VERSION) || defined(HEDLEY_TI_CL7X_VERSION) || \ + defined(HEDLEY_TI_CLPRU_VERSION) || defined(__clang__) +#define HEDLEY_IS_CONSTEXPR_(expr) \ (sizeof(void) != sizeof(*(1 ? 
((void *)((expr)*0L)) : ((struct { \ char v[sizeof(void) * 2]; \ } *)1)))) #endif #endif -#if defined(HEDLEY__IS_CONSTEXPR) +#if defined(HEDLEY_IS_CONSTEXPR_) #if !defined(HEDLEY_IS_CONSTANT) -#define HEDLEY_IS_CONSTANT(expr) HEDLEY__IS_CONSTEXPR(expr) +#define HEDLEY_IS_CONSTANT(expr) HEDLEY_IS_CONSTEXPR_(expr) #endif #define HEDLEY_REQUIRE_CONSTEXPR(expr) \ - (HEDLEY__IS_CONSTEXPR(expr) ? (expr) : (-1)) + (HEDLEY_IS_CONSTEXPR_(expr) ? (expr) : (-1)) #else #if !defined(HEDLEY_IS_CONSTANT) #define HEDLEY_IS_CONSTANT(expr) (0) @@ -1420,56 +1871,29 @@ HEDLEY_DIAGNOSTIC_POP HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) || defined(_Static_assert)) #define HEDLEY_STATIC_ASSERT(expr, message) _Static_assert(expr, message) #elif (defined(__cplusplus) && (__cplusplus >= 201103L)) || \ - HEDLEY_MSVC_VERSION_CHECK(16, 0, 0) || \ - (defined(__cplusplus) && HEDLEY_TI_VERSION_CHECK(8, 3, 0)) -#define HEDLEY_STATIC_ASSERT(expr, message) static_assert(expr, message) + HEDLEY_MSVC_VERSION_CHECK(16, 0, 0) +#define HEDLEY_STATIC_ASSERT(expr, message) \ + HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_( \ + static_assert(expr, message)) #else #define HEDLEY_STATIC_ASSERT(expr, message) #endif -#if defined(HEDLEY_CONST_CAST) -#undef HEDLEY_CONST_CAST +#if defined(HEDLEY_NULL) +#undef HEDLEY_NULL #endif #if defined(__cplusplus) -#define HEDLEY_CONST_CAST(T, expr) (const_cast(expr)) -#elif HEDLEY_HAS_WARNING("-Wcast-qual") || \ - HEDLEY_GCC_VERSION_CHECK(4, 6, 0) || \ - HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) -#define HEDLEY_CONST_CAST(T, expr) \ - (__extension__({ \ - HEDLEY_DIAGNOSTIC_PUSH \ - HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL((T)(expr)); \ - HEDLEY_DIAGNOSTIC_POP \ - })) -#else -#define HEDLEY_CONST_CAST(T, expr) ((T)(expr)) -#endif - -#if defined(HEDLEY_REINTERPRET_CAST) -#undef HEDLEY_REINTERPRET_CAST -#endif -#if defined(__cplusplus) -#define HEDLEY_REINTERPRET_CAST(T, expr) (reinterpret_cast(expr)) +#if __cplusplus >= 201103L +#define HEDLEY_NULL HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(nullptr) +#elif defined(NULL) +#define HEDLEY_NULL NULL #else -#define HEDLEY_REINTERPRET_CAST(T, expr) (*((T *)&(expr))) +#define HEDLEY_NULL HEDLEY_STATIC_CAST(void *, 0) #endif - -#if defined(HEDLEY_STATIC_CAST) -#undef HEDLEY_STATIC_CAST -#endif -#if defined(__cplusplus) -#define HEDLEY_STATIC_CAST(T, expr) (static_cast(expr)) +#elif defined(NULL) +#define HEDLEY_NULL NULL #else -#define HEDLEY_STATIC_CAST(T, expr) ((T)(expr)) -#endif - -#if defined(HEDLEY_CPP_CAST) -#undef HEDLEY_CPP_CAST -#endif -#if defined(__cplusplus) -#define HEDLEY_CPP_CAST(T, expr) static_cast(expr) -#else -#define HEDLEY_CPP_CAST(T, expr) (expr) +#define HEDLEY_NULL ((void *)0) #endif #if defined(HEDLEY_MESSAGE) @@ -1502,7 +1926,9 @@ HEDLEY_DIAGNOSTIC_POP HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS \ HEDLEY_PRAGMA(clang warning msg) \ HEDLEY_DIAGNOSTIC_POP -#elif HEDLEY_GCC_VERSION_CHECK(4, 8, 0) || HEDLEY_PGI_VERSION_CHECK(18, 4, 0) +#elif HEDLEY_GCC_VERSION_CHECK(4, 8, 0) || \ + HEDLEY_PGI_VERSION_CHECK(18, 4, 0) || \ + HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) #define HEDLEY_WARNING(msg) HEDLEY_PRAGMA(GCC warning msg) #elif HEDLEY_MSVC_VERSION_CHECK(15, 0, 0) #define HEDLEY_WARNING(msg) HEDLEY_PRAGMA(message(msg)) @@ -1510,29 +1936,35 @@ HEDLEY_DIAGNOSTIC_POP #define HEDLEY_WARNING(msg) HEDLEY_MESSAGE(msg) #endif +#if defined(HEDLEY_REQUIRE) +#undef HEDLEY_REQUIRE +#endif #if defined(HEDLEY_REQUIRE_MSG) #undef HEDLEY_REQUIRE_MSG #endif #if HEDLEY_HAS_ATTRIBUTE(diagnose_if) #if HEDLEY_HAS_WARNING("-Wgcc-compat") -#define HEDLEY_REQUIRE_MSG(expr, msg) \ - 
HEDLEY_DIAGNOSTIC_PUSH \ - _Pragma("clang diagnostic ignored \"-Wgcc-compat\"") \ - __attribute__((__diagnose_if__(!(expr), msg, "error"))) \ +#define HEDLEY_REQUIRE(expr) \ + HEDLEY_DIAGNOSTIC_PUSH \ + _Pragma("clang diagnostic ignored \"-Wgcc-compat\"") \ + __attribute__((diagnose_if(!(expr), #expr, "error"))) \ + HEDLEY_DIAGNOSTIC_POP +#define HEDLEY_REQUIRE_MSG(expr, msg) \ + HEDLEY_DIAGNOSTIC_PUSH \ + _Pragma("clang diagnostic ignored \"-Wgcc-compat\"") \ + __attribute__((diagnose_if(!(expr), msg, "error"))) \ HEDLEY_DIAGNOSTIC_POP #else +#define HEDLEY_REQUIRE(expr) \ + __attribute__((diagnose_if(!(expr), #expr, "error"))) #define HEDLEY_REQUIRE_MSG(expr, msg) \ - __attribute__((__diagnose_if__(!(expr), msg, "error"))) + __attribute__((diagnose_if(!(expr), msg, "error"))) #endif #else +#define HEDLEY_REQUIRE(expr) #define HEDLEY_REQUIRE_MSG(expr, msg) #endif -#if defined(HEDLEY_REQUIRE) -#undef HEDLEY_REQUIRE -#endif -#define HEDLEY_REQUIRE(expr) HEDLEY_REQUIRE_MSG(expr, #expr) - #if defined(HEDLEY_FLAGS) #undef HEDLEY_FLAGS #endif diff --git a/libobs/util/simde/mmx.h b/libobs/util/simde/mmx.h index fd38acbcd78013abe5c09cc0d6c8c0097ff8d44b..886ed81f47416c5194f35f77e5ca0652d057af3b 100644 --- a/libobs/util/simde/mmx.h +++ b/libobs/util/simde/mmx.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2017-2018 Evan Nemerson +/* SPDX-License-Identifier: MIT * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -19,64 +19,71 @@ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. + * + * Copyright: + * 2017-2020 Evan Nemerson */ -#if !defined(SIMDE__MMX_H) -#if !defined(SIMDE__MMX_H) -#define SIMDE__MMX_H -#endif +#if !defined(SIMDE_X86_MMX_H) +#define SIMDE_X86_MMX_H + #include "simde-common.h" -#if defined(SIMDE_MMX_FORCE_NATIVE) -#define SIMDE_MMX_NATIVE -#elif defined(__MMX__) && !defined(SIMDE_MMX_NO_NATIVE) && \ - !defined(SIMDE_NO_NATIVE) -#define SIMDE_MMX_NATIVE -#elif defined(__ARM_NEON) && !defined(SIMDE_MMX_NO_NEON) && \ - !defined(SIMDE_NO_NEON) -#define SIMDE_MMX_NEON +#if !defined(SIMDE_X86_MMX_NATIVE) && defined(SIMDE_ENABLE_NATIVE_ALIASES) +#define SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES +#endif + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS + +#if defined(SIMDE_X86_MMX_NATIVE) +#define SIMDE_X86_MMX_USE_NATIVE_TYPE +#elif defined(SIMDE_X86_SSE_NATIVE) +#define SIMDE_X86_MMX_USE_NATIVE_TYPE #endif -#if defined(SIMDE_MMX_NATIVE) +#if defined(SIMDE_X86_MMX_USE_NATIVE_TYPE) #include -#else -#if defined(SIMDE_MMX_NEON) +#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) #include #endif -#endif + #include #include -#include -#include -SIMDE__BEGIN_DECLS +SIMDE_BEGIN_DECLS_ typedef union { -#if defined(SIMDE__ENABLE_GCC_VEC_EXT) - int8_t i8 __attribute__((__vector_size__(8), __may_alias__)); - int16_t i16 __attribute__((__vector_size__(8), __may_alias__)); - int32_t i32 __attribute__((__vector_size__(8), __may_alias__)); - int64_t i64 __attribute__((__vector_size__(8), __may_alias__)); - uint8_t u8 __attribute__((__vector_size__(8), __may_alias__)); - uint16_t u16 __attribute__((__vector_size__(8), __may_alias__)); - uint32_t u32 __attribute__((__vector_size__(8), __may_alias__)); - uint64_t u64 __attribute__((__vector_size__(8), __may_alias__)); - simde_float32 f32 __attribute__((__vector_size__(8), __may_alias__)); -#else - int8_t i8[8]; - int16_t i16[4]; - int32_t i32[2]; - int64_t i64[1]; - uint8_t u8[8]; - uint16_t 
u16[4]; - uint32_t u32[2]; - uint64_t u64[1]; - simde_float32 f32[2]; -#endif - -#if defined(SIMDE_MMX_NATIVE) +#if defined(SIMDE_VECTOR_SUBSCRIPT) + SIMDE_ALIGN(8) int8_t i8 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS; + SIMDE_ALIGN(8) int16_t i16 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS; + SIMDE_ALIGN(8) int32_t i32 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS; + SIMDE_ALIGN(8) int64_t i64 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS; + SIMDE_ALIGN(8) uint8_t u8 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS; + SIMDE_ALIGN(8) uint16_t u16 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS; + SIMDE_ALIGN(8) uint32_t u32 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS; + SIMDE_ALIGN(8) uint64_t u64 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS; + SIMDE_ALIGN(8) simde_float32 f32 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS; + SIMDE_ALIGN(8) int_fast32_t i32f SIMDE_VECTOR(8) SIMDE_MAY_ALIAS; + SIMDE_ALIGN(8) uint_fast32_t u32f SIMDE_VECTOR(8) SIMDE_MAY_ALIAS; +#else + SIMDE_ALIGN(8) int8_t i8[8]; + SIMDE_ALIGN(8) int16_t i16[4]; + SIMDE_ALIGN(8) int32_t i32[2]; + SIMDE_ALIGN(8) int64_t i64[1]; + SIMDE_ALIGN(8) uint8_t u8[8]; + SIMDE_ALIGN(8) uint16_t u16[4]; + SIMDE_ALIGN(8) uint32_t u32[2]; + SIMDE_ALIGN(8) uint64_t u64[1]; + SIMDE_ALIGN(8) simde_float32 f32[2]; + SIMDE_ALIGN(8) int_fast32_t i32f[8 / sizeof(int_fast32_t)]; + SIMDE_ALIGN(8) uint_fast32_t u32f[8 / sizeof(uint_fast32_t)]; +#endif + +#if defined(SIMDE_X86_MMX_USE_NATIVE_TYPE) __m64 n; -#elif defined(SIMDE_MMX_NEON) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) int8x8_t neon_i8; int16x4_t neon_i16; int32x2_t neon_i32; @@ -87,1270 +94,2177 @@ typedef union { uint64x1_t neon_u64; float32x2_t neon_f32; #endif -} simde__m64; +} simde__m64_private; + +#if defined(SIMDE_X86_MMX_USE_NATIVE_TYPE) +typedef __m64 simde__m64; +#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) +typedef int32x2_t simde__m64; +#elif defined(SIMDE_VECTOR_SUBSCRIPT) +typedef int32_t simde__m64 SIMDE_ALIGN(8) SIMDE_VECTOR(8) SIMDE_MAY_ALIAS; +#else +typedef simde__m64_private simde__m64; +#endif + +#if !defined(SIMDE_X86_MMX_USE_NATIVE_TYPE) && \ + defined(SIMDE_ENABLE_NATIVE_ALIASES) +#define SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES +typedef simde__m64 __m64; +#endif -#if defined(SIMDE_MMX_NATIVE) -HEDLEY_STATIC_ASSERT(sizeof(__m64) == sizeof(simde__m64), - "__m64 size doesn't match simde__m64 size"); -SIMDE__FUNCTION_ATTRIBUTES simde__m64 SIMDE__M64_C(__m64 v) +HEDLEY_STATIC_ASSERT(8 == sizeof(simde__m64), "__m64 size incorrect"); +HEDLEY_STATIC_ASSERT(8 == sizeof(simde__m64_private), "__m64 size incorrect"); +#if defined(SIMDE_CHECK_ALIGNMENT) && defined(SIMDE_ALIGN_OF) +HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m64) == 8, + "simde__m64 is not 8-byte aligned"); +HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m64_private) == 8, + "simde__m64_private is not 8-byte aligned"); +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m64 simde__m64_from_private(simde__m64_private v) { simde__m64 r; - r.n = v; + simde_memcpy(&r, &v, sizeof(r)); return r; } -#elif defined(SIMDE_MMX_NEON) -#define SIMDE__M64_NEON_C(T, expr) \ - (simde__m64) { .neon_##T = (expr) } -#endif -HEDLEY_STATIC_ASSERT(8 == sizeof(simde__m64), "__m64 size incorrect"); -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES +simde__m64_private simde__m64_to_private(simde__m64 v) +{ + simde__m64_private r; + simde_memcpy(&r, &v, sizeof(r)); + return r; +} + +#define SIMDE_X86_GENERATE_CONVERSION_FUNCTION(simde_type, source_type, isax, \ + fragment) \ + SIMDE_FUNCTION_ATTRIBUTES \ + simde__##simde_type simde__##simde_type##_from_##isax##_##fragment( \ + source_type value) \ + { \ + simde__##simde_type##_private r_; \ + r_.isax##_##fragment 
= value; \ + return simde__##simde_type##_from_private(r_); \ + } \ + \ + SIMDE_FUNCTION_ATTRIBUTES \ + source_type simde__##simde_type##_to_##isax##_##fragment( \ + simde__##simde_type value) \ + { \ + simde__##simde_type##_private r_ = \ + simde__##simde_type##_to_private(value); \ + return r_.isax##_##fragment; \ + } + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) +SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, int8x8_t, neon, i8) +SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, int16x4_t, neon, i16) +SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, int32x2_t, neon, i32) +SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, int64x1_t, neon, i64) +SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, uint8x8_t, neon, u8) +SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, uint16x4_t, neon, u16) +SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, uint32x2_t, neon, u32) +SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, uint64x1_t, neon, u64) +SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, float32x2_t, neon, f32) +#endif /* defined(SIMDE_ARM_NEON_A32V7_NATIVE) */ + +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_add_pi8(simde__m64 a, simde__m64 b) { -#if defined(SIMDE_MMX_NATIVE) - return SIMDE__M64_C(_mm_add_pi8(a.n, b.n)); +#if defined(SIMDE_X86_MMX_NATIVE) + return _mm_add_pi8(a, b); #else - simde__m64 r; - SIMDE__VECTORIZE - for (size_t i = 0; i < 8; i++) { - r.i8[i] = a.i8[i] + b.i8[i]; + simde__m64_private r_; + simde__m64_private a_ = simde__m64_to_private(a); + simde__m64_private b_ = simde__m64_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i8 = vadd_s8(a_.neon_i8, b_.neon_i8); +#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i8 = a_.i8 + b_.i8; +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])); i++) { + r_.i8[i] = a_.i8[i] + b_.i8[i]; } - return r; +#endif + + return simde__m64_from_private(r_); #endif } #define simde_m_paddb(a, b) simde_mm_add_pi8(a, b) +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _mm_add_pi8(a, b) simde_mm_add_pi8(a, b) +#define _m_paddb(a, b) simde_m_paddb(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_add_pi16(simde__m64 a, simde__m64 b) { -#if defined(SIMDE_MMX_NATIVE) - return SIMDE__M64_C(_mm_add_pi16(a.n, b.n)); +#if defined(SIMDE_X86_MMX_NATIVE) + return _mm_add_pi16(a, b); #else - simde__m64 r; - SIMDE__VECTORIZE - for (size_t i = 0; i < (8 / sizeof(int16_t)); i++) { - r.i16[i] = a.i16[i] + b.i16[i]; + simde__m64_private r_; + simde__m64_private a_ = simde__m64_to_private(a); + simde__m64_private b_ = simde__m64_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i16 = vadd_s16(a_.neon_i16, b_.neon_i16); +#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i16 = a_.i16 + b_.i16; +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) { + r_.i16[i] = a_.i16[i] + b_.i16[i]; } - return r; +#endif + + return simde__m64_from_private(r_); #endif } #define simde_m_paddw(a, b) simde_mm_add_pi16(a, b) +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _mm_add_pi16(a, b) simde_mm_add_pi16(a, b) +#define _m_add_paddw(a, b) simde_mm_add_pi16(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_add_pi32(simde__m64 a, simde__m64 b) { -#if defined(SIMDE_MMX_NATIVE) - return SIMDE__M64_C(_mm_add_pi32(a.n, b.n)); +#if defined(SIMDE_X86_MMX_NATIVE) + return _mm_add_pi32(a, b); #else - simde__m64 r; - SIMDE__VECTORIZE - for (size_t i = 0; i < (8 / sizeof(int32_t)); i++) { - r.i32[i] = a.i32[i] + b.i32[i]; + simde__m64_private 
r_; + simde__m64_private a_ = simde__m64_to_private(a); + simde__m64_private b_ = simde__m64_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i32 = vadd_s32(a_.neon_i32, b_.neon_i32); +#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i32 = a_.i32 + b_.i32; +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) { + r_.i32[i] = a_.i32[i] + b_.i32[i]; } - return r; +#endif + + return simde__m64_from_private(r_); #endif } #define simde_m_paddd(a, b) simde_mm_add_pi32(a, b) +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _mm_add_pi32(a, b) simde_mm_add_pi32(a, b) +#define _m_add_paddd(a, b) simde_mm_add_pi32(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_adds_pi8(simde__m64 a, simde__m64 b) { -#if defined(SIMDE_MMX_NATIVE) - return SIMDE__M64_C(_mm_adds_pi8(a.n, b.n)); +#if defined(SIMDE_X86_MMX_NATIVE) + return _mm_adds_pi8(a, b); #else - simde__m64 r; - SIMDE__VECTORIZE - for (int i = 0; i < 8; i++) { - if ((((b.i8[i]) > 0) && ((a.i8[i]) > (INT8_MAX - (b.i8[i]))))) { - r.i8[i] = INT8_MAX; - } else if ((((b.i8[i]) < 0) && - ((a.i8[i]) < (INT8_MIN - (b.i8[i]))))) { - r.i8[i] = INT8_MIN; + simde__m64_private r_, a_ = simde__m64_to_private(a), + b_ = simde__m64_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i8 = vqadd_s8(a_.neon_i8, b_.neon_i8); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])); i++) { + if ((((b_.i8[i]) > 0) && + ((a_.i8[i]) > (INT8_MAX - (b_.i8[i]))))) { + r_.i8[i] = INT8_MAX; + } else if ((((b_.i8[i]) < 0) && + ((a_.i8[i]) < (INT8_MIN - (b_.i8[i]))))) { + r_.i8[i] = INT8_MIN; } else { - r.i8[i] = (a.i8[i]) + (b.i8[i]); + r_.i8[i] = (a_.i8[i]) + (b_.i8[i]); } } - return r; +#endif + + return simde__m64_from_private(r_); #endif } #define simde_m_paddsb(a, b) simde_mm_adds_pi8(a, b) +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _mm_adds_pi8(a, b) simde_mm_adds_pi8(a, b) +#define _m_add_paddsb(a, b) simde_mm_adds_pi8(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_adds_pu8(simde__m64 a, simde__m64 b) { -#if defined(SIMDE_MMX_NATIVE) - return SIMDE__M64_C(_mm_adds_pu8(a.n, b.n)); +#if defined(SIMDE_X86_MMX_NATIVE) + return _mm_adds_pu8(a, b); #else - simde__m64 r; - SIMDE__VECTORIZE - for (size_t i = 0; i < 8; i++) { - const int32_t x = a.u8[i] + b.u8[i]; - if (x < 0) - r.u8[i] = 0; - else if (x > UINT8_MAX) - r.u8[i] = UINT8_MAX; + simde__m64_private r_; + simde__m64_private a_ = simde__m64_to_private(a); + simde__m64_private b_ = simde__m64_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_u8 = vqadd_u8(a_.neon_u8, b_.neon_u8); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.u8) / sizeof(r_.u8[0])); i++) { + const uint_fast16_t x = + HEDLEY_STATIC_CAST(uint_fast16_t, a_.u8[i]) + + HEDLEY_STATIC_CAST(uint_fast16_t, b_.u8[i]); + if (x > UINT8_MAX) + r_.u8[i] = UINT8_MAX; else - r.u8[i] = (uint8_t)x; + r_.u8[i] = HEDLEY_STATIC_CAST(uint8_t, x); } - return r; +#endif + + return simde__m64_from_private(r_); #endif } #define simde_m_paddusb(a, b) simde_mm_adds_pu8(a, b) +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _mm_adds_pu8(a, b) simde_mm_adds_pu8(a, b) +#define _m_paddusb(a, b) simde_mm_adds_pu8(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_adds_pi16(simde__m64 a, simde__m64 b) { -#if defined(SIMDE_MMX_NATIVE) - return SIMDE__M64_C(_mm_adds_pi16(a.n, b.n)); +#if 
defined(SIMDE_X86_MMX_NATIVE) + return _mm_adds_pi16(a, b); #else - simde__m64 r; - SIMDE__VECTORIZE - for (int i = 0; i < 4; i++) { - if ((((b.i16[i]) > 0) && - ((a.i16[i]) > (INT16_MAX - (b.i16[i]))))) { - r.i16[i] = INT16_MAX; - } else if ((((b.i16[i]) < 0) && - ((a.i16[i]) < (SHRT_MIN - (b.i16[i]))))) { - r.i16[i] = SHRT_MIN; + simde__m64_private r_; + simde__m64_private a_ = simde__m64_to_private(a); + simde__m64_private b_ = simde__m64_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i16 = vqadd_s16(a_.neon_i16, b_.neon_i16); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) { + if ((((b_.i16[i]) > 0) && + ((a_.i16[i]) > (INT16_MAX - (b_.i16[i]))))) { + r_.i16[i] = INT16_MAX; + } else if ((((b_.i16[i]) < 0) && + ((a_.i16[i]) < (SHRT_MIN - (b_.i16[i]))))) { + r_.i16[i] = SHRT_MIN; } else { - r.i16[i] = (a.i16[i]) + (b.i16[i]); + r_.i16[i] = (a_.i16[i]) + (b_.i16[i]); } } - return r; +#endif + + return simde__m64_from_private(r_); #endif } #define simde_m_paddsw(a, b) simde_mm_adds_pi16(a, b) +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _mm_adds_pi16(a, b) simde_mm_adds_pi16(a, b) +#define _m_paddsw(a, b) simde_mm_adds_pi16(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_adds_pu16(simde__m64 a, simde__m64 b) { -#if defined(SIMDE_MMX_NATIVE) - return SIMDE__M64_C(_mm_adds_pu16(a.n, b.n)); +#if defined(SIMDE_X86_MMX_NATIVE) + return _mm_adds_pu16(a, b); #else - simde__m64 r; - SIMDE__VECTORIZE - for (size_t i = 0; i < (8 / sizeof(int16_t)); i++) { - const uint32_t x = a.u16[i] + b.u16[i]; + simde__m64_private r_; + simde__m64_private a_ = simde__m64_to_private(a); + simde__m64_private b_ = simde__m64_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_u16 = vqadd_u16(a_.neon_u16, b_.neon_u16); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) { + const uint32_t x = a_.u16[i] + b_.u16[i]; if (x > UINT16_MAX) - r.u16[i] = UINT16_MAX; + r_.u16[i] = UINT16_MAX; else - r.u16[i] = (uint16_t)x; + r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, x); } - return r; +#endif + + return simde__m64_from_private(r_); #endif } #define simde_m_paddusw(a, b) simde_mm_adds_pu16(a, b) +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _mm_adds_pu16(a, b) simde_mm_adds_pu16(a, b) +#define _m_paddusw(a, b) simde_mm_adds_pu16(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_and_si64(simde__m64 a, simde__m64 b) { -#if defined(SIMDE_MMX_NATIVE) - return SIMDE__M64_C(_mm_and_si64(a.n, b.n)); +#if defined(SIMDE_X86_MMX_NATIVE) + return _mm_and_si64(a, b); #else - simde__m64 r; - r.i64[0] = a.i64[0] & b.i64[0]; - return r; + simde__m64_private r_; + simde__m64_private a_ = simde__m64_to_private(a); + simde__m64_private b_ = simde__m64_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i32 = vand_s32(a_.neon_i32, b_.neon_i32); +#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i64 = a_.i64 & b_.i64; +#else + r_.i64[0] = a_.i64[0] & b_.i64[0]; +#endif + + return simde__m64_from_private(r_); #endif } #define simde_m_pand(a, b) simde_mm_and_si64(a, b) +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _mm_and_si64(a, b) simde_mm_and_si64(a, b) +#define _m_pand(a, b) simde_mm_and_si64(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_andnot_si64(simde__m64 a, simde__m64 b) { -#if defined(SIMDE_MMX_NATIVE) - return 
SIMDE__M64_C(_mm_andnot_si64(a.n, b.n)); +#if defined(SIMDE_X86_MMX_NATIVE) + return _mm_andnot_si64(a, b); #else - simde__m64 r; - r.i64[0] = ~(a.i64[0]) & b.i64[0]; - return r; + simde__m64_private r_; + simde__m64_private a_ = simde__m64_to_private(a); + simde__m64_private b_ = simde__m64_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i32 = vbic_s32(b_.neon_i32, a_.neon_i32); +#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i32f = ~a_.i32f & b_.i32f; +#else + r_.u64[0] = (~(a_.u64[0])) & (b_.u64[0]); +#endif + + return simde__m64_from_private(r_); #endif } #define simde_m_pandn(a, b) simde_mm_andnot_si64(a, b) +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _mm_andnot_si64(a, b) simde_mm_andnot_si64(a, b) +#define _m_pandn(a, b) simde_mm_andnot_si64(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_cmpeq_pi8(simde__m64 a, simde__m64 b) { -#if defined(SIMDE_MMX_NATIVE) - return SIMDE__M64_C(_mm_cmpeq_pi8(a.n, b.n)); +#if defined(SIMDE_X86_MMX_NATIVE) + return _mm_cmpeq_pi8(a, b); #else - simde__m64 r; - SIMDE__VECTORIZE - for (int i = 0; i < 8; i++) { - r.i8[i] = (a.i8[i] == b.i8[i]) * 0xff; + simde__m64_private r_; + simde__m64_private a_ = simde__m64_to_private(a); + simde__m64_private b_ = simde__m64_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i8 = vreinterpret_s8_u8(vceq_s8(a_.neon_i8, b_.neon_i8)); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])); i++) { + r_.i8[i] = (a_.i8[i] == b_.i8[i]) ? ~INT8_C(0) : INT8_C(0); } - return r; +#endif + + return simde__m64_from_private(r_); #endif } #define simde_m_pcmpeqb(a, b) simde_mm_cmpeq_pi8(a, b) +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _mm_cmpeq_pi8(a, b) simde_mm_cmpeq_pi8(a, b) +#define _m_pcmpeqb(a, b) simde_mm_cmpeq_pi8(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_cmpeq_pi16(simde__m64 a, simde__m64 b) { -#if defined(SIMDE_MMX_NATIVE) - return SIMDE__M64_C(_mm_cmpeq_pi16(a.n, b.n)); +#if defined(SIMDE_X86_MMX_NATIVE) + return _mm_cmpeq_pi16(a, b); #else - simde__m64 r; - SIMDE__VECTORIZE - for (int i = 0; i < 4; i++) { - r.i16[i] = (a.i16[i] == b.i16[i]) * 0xffff; + simde__m64_private r_; + simde__m64_private a_ = simde__m64_to_private(a); + simde__m64_private b_ = simde__m64_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i16 = vreinterpret_s16_u16(vceq_s16(a_.neon_i16, b_.neon_i16)); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) { + r_.i16[i] = (a_.i16[i] == b_.i16[i]) ? 
~INT16_C(0) : INT16_C(0); } - return r; +#endif + + return simde__m64_from_private(r_); #endif } #define simde_m_pcmpeqw(a, b) simde_mm_cmpeq_pi16(a, b) +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _mm_cmpeq_pi16(a, b) simde_mm_cmpeq_pi16(a, b) +#define _m_pcmpeqw(a, b) simde_mm_cmpeq_pi16(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_cmpeq_pi32(simde__m64 a, simde__m64 b) { -#if defined(SIMDE_MMX_NATIVE) - return SIMDE__M64_C(_mm_cmpeq_pi32(a.n, b.n)); +#if defined(SIMDE_X86_MMX_NATIVE) + return _mm_cmpeq_pi32(a, b); #else - simde__m64 r; - SIMDE__VECTORIZE - for (int i = 0; i < 2; i++) { - r.i32[i] = (a.i32[i] == b.i32[i]) * 0xffffffff; + simde__m64_private r_; + simde__m64_private a_ = simde__m64_to_private(a); + simde__m64_private b_ = simde__m64_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i32 = vreinterpret_s32_u32(vceq_s32(a_.neon_i32, b_.neon_i32)); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) { + r_.i32[i] = (a_.i32[i] == b_.i32[i]) ? ~INT32_C(0) : INT32_C(0); } - return r; +#endif + + return simde__m64_from_private(r_); #endif } #define simde_m_pcmpeqd(a, b) simde_mm_cmpeq_pi32(a, b) +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _mm_cmpeq_pi32(a, b) simde_mm_cmpeq_pi32(a, b) +#define _m_pcmpeqd(a, b) simde_mm_cmpeq_pi32(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_cmpgt_pi8(simde__m64 a, simde__m64 b) { -#if defined(SIMDE_MMX_NATIVE) - return SIMDE__M64_C(_mm_cmpgt_pi8(a.n, b.n)); +#if defined(SIMDE_X86_MMX_NATIVE) + return _mm_cmpgt_pi8(a, b); #else - simde__m64 r; - SIMDE__VECTORIZE - for (int i = 0; i < 8; i++) { - r.i8[i] = (a.i8[i] > b.i8[i]) * 0xff; + simde__m64_private r_; + simde__m64_private a_ = simde__m64_to_private(a); + simde__m64_private b_ = simde__m64_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i8 = vreinterpret_s8_u8(vcgt_s8(a_.neon_i8, b_.neon_i8)); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])); i++) { + r_.i8[i] = (a_.i8[i] > b_.i8[i]) ? ~INT8_C(0) : INT8_C(0); } - return r; +#endif + + return simde__m64_from_private(r_); #endif } #define simde_m_pcmpgtb(a, b) simde_mm_cmpgt_pi8(a, b) +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _mm_cmpgt_pi8(a, b) simde_mm_cmpgt_pi8(a, b) +#define _m_pcmpgtb(a, b) simde_mm_cmpgt_pi8(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_cmpgt_pi16(simde__m64 a, simde__m64 b) { -#if defined(SIMDE_MMX_NATIVE) - return SIMDE__M64_C(_mm_cmpgt_pi16(a.n, b.n)); +#if defined(SIMDE_X86_MMX_NATIVE) + return _mm_cmpgt_pi16(a, b); #else - simde__m64 r; - SIMDE__VECTORIZE - for (int i = 0; i < 4; i++) { - r.i16[i] = (a.i16[i] > b.i16[i]) * 0xffff; + simde__m64_private r_; + simde__m64_private a_ = simde__m64_to_private(a); + simde__m64_private b_ = simde__m64_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i16 = vreinterpret_s16_u16(vcgt_s16(a_.neon_i16, b_.neon_i16)); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) { + r_.i16[i] = (a_.i16[i] > b_.i16[i]) ? 
~INT16_C(0) : INT16_C(0); } - return r; +#endif + + return simde__m64_from_private(r_); #endif } #define simde_m_pcmpgtw(a, b) simde_mm_cmpgt_pi16(a, b) +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _mm_cmpgt_pi16(a, b) simde_mm_cmpgt_pi16(a, b) +#define _m_pcmpgtw(a, b) simde_mm_cmpgt_pi16(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_cmpgt_pi32(simde__m64 a, simde__m64 b) { -#if defined(SIMDE_MMX_NATIVE) - return SIMDE__M64_C(_mm_cmpgt_pi32(a.n, b.n)); +#if defined(SIMDE_X86_MMX_NATIVE) + return _mm_cmpgt_pi32(a, b); #else - simde__m64 r; - SIMDE__VECTORIZE - for (int i = 0; i < 2; i++) { - r.i32[i] = (a.i32[i] > b.i32[i]) * 0xffffffff; + simde__m64_private r_; + simde__m64_private a_ = simde__m64_to_private(a); + simde__m64_private b_ = simde__m64_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i32 = vreinterpret_s32_u32(vcgt_s32(a_.neon_i32, b_.neon_i32)); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) { + r_.i32[i] = (a_.i32[i] > b_.i32[i]) ? ~INT32_C(0) : INT32_C(0); } - return r; +#endif + + return simde__m64_from_private(r_); #endif } #define simde_m_pcmpgtd(a, b) simde_mm_cmpgt_pi32(a, b) +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _mm_cmpgt_pi32(a, b) simde_mm_cmpgt_pi32(a, b) +#define _m_pcmpgtd(a, b) simde_mm_cmpgt_pi32(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES int64_t simde_mm_cvtm64_si64(simde__m64 a) { -#if defined(SIMDE_MMX_NATIVE) && defined(SIMDE_ARCH_AMD64) && !defined(__PGI) - return _mm_cvtm64_si64(a.n); +#if defined(SIMDE_X86_MMX_NATIVE) && defined(SIMDE_ARCH_AMD64) && \ + !defined(__PGI) + return _mm_cvtm64_si64(a); +#else + simde__m64_private a_ = simde__m64_to_private(a); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vget_lane_s64(a_.neon_i64, 0); #else - return a.i64[0]; + return a_.i64[0]; +#endif #endif } #define simde_m_to_int64(a) simde_mm_cvtm64_si64(a) +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _mm_cvtm64_si64(a) simde_mm_cvtm64_si64(a) +#define _m_to_int64(a) simde_mm_cvtm64_si64(a) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_cvtsi32_si64(int32_t a) { -#if defined(SIMDE_MMX_NATIVE) - return SIMDE__M64_C(_mm_cvtsi32_si64(a)); +#if defined(SIMDE_X86_MMX_NATIVE) + return _mm_cvtsi32_si64(a); #else - simde__m64 r; - r.i32[0] = a; - r.i32[1] = 0; - return r; + simde__m64_private r_; + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + const int32_t av[sizeof(r_.neon_i32) / sizeof(r_.neon_i32[0])] = {a, 0}; + r_.neon_i32 = vld1_s32(av); +#else + r_.i32[0] = a; + r_.i32[1] = 0; +#endif + + return simde__m64_from_private(r_); #endif } #define simde_m_from_int(a) simde_mm_cvtsi32_si64(a) +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _mm_cvtsi32_si64(a) simde_mm_cvtsi32_si64(a) +#define _m_from_int(a) simde_mm_cvtsi32_si64(a) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_cvtsi64_m64(int64_t a) { -#if defined(SIMDE_MMX_NATIVE) && defined(SIMDE_ARCH_AMD64) && !defined(__PGI) - return SIMDE__M64_C(_mm_cvtsi64_m64(a)); +#if defined(SIMDE_X86_MMX_NATIVE) && defined(SIMDE_ARCH_AMD64) && \ + !defined(__PGI) + return _mm_cvtsi64_m64(a); #else - simde__m64 r; - r.i64[0] = a; - return r; + simde__m64_private r_; + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i64 = vld1_s64(&a); +#else + r_.i64[0] = a; +#endif + + return simde__m64_from_private(r_); #endif } #define simde_m_from_int64(a) 
simde_mm_cvtsi64_m64(a) +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _mm_cvtsi64_m64(a) simde_mm_cvtsi64_m64(a) +#define _m_from_int64(a) simde_mm_cvtsi64_m64(a) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES int32_t simde_mm_cvtsi64_si32(simde__m64 a) { -#if defined(SIMDE_MMX_NATIVE) - return _mm_cvtsi64_si32(a.n); +#if defined(SIMDE_X86_MMX_NATIVE) + return _mm_cvtsi64_si32(a); +#else + simde__m64_private a_ = simde__m64_to_private(a); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vget_lane_s32(a_.neon_i32, 0); #else - return a.i32[0]; + return a_.i32[0]; +#endif #endif } +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _mm_cvtsi64_si32(a) simde_mm_cvtsi64_si32(a) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES void simde_mm_empty(void) { -#if defined(SIMDE_MMX_NATIVE) +#if defined(SIMDE_X86_MMX_NATIVE) _mm_empty(); #else #endif } #define simde_m_empty() simde_mm_empty() +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _mm_empty() simde_mm_empty() +#define _m_empty() simde_mm_empty() +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_madd_pi16(simde__m64 a, simde__m64 b) { -#if defined(SIMDE_MMX_NATIVE) - return SIMDE__M64_C(_mm_madd_pi16(a.n, b.n)); +#if defined(SIMDE_X86_MMX_NATIVE) + return _mm_madd_pi16(a, b); #else - simde__m64 r; - SIMDE__VECTORIZE - for (int i = 0; i < 4; i += 2) { - r.i32[i / 2] = - (a.i16[i] * b.i16[i]) + (a.i16[i + 1] * b.i16[i + 1]); + simde__m64_private r_; + simde__m64_private a_ = simde__m64_to_private(a); + simde__m64_private b_ = simde__m64_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + int32x4_t i1 = vmull_s16(a_.neon_i16, b_.neon_i16); + r_.neon_i32 = vpadd_s32(vget_low_s32(i1), vget_high_s32(i1)); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i += 2) { + r_.i32[i / 2] = (a_.i16[i] * b_.i16[i]) + + (a_.i16[i + 1] * b_.i16[i + 1]); } - return r; +#endif + + return simde__m64_from_private(r_); #endif } #define simde_m_pmaddwd(a, b) simde_mm_madd_pi16(a, b) +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _mm_madd_pi16(a, b) simde_mm_madd_pi16(a, b) +#define _m_pmaddwd(a, b) simde_mm_madd_pi16(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_mulhi_pi16(simde__m64 a, simde__m64 b) { -#if defined(SIMDE_MMX_NATIVE) - return SIMDE__M64_C(_mm_mulhi_pi16(a.n, b.n)); +#if defined(SIMDE_X86_MMX_NATIVE) + return _mm_mulhi_pi16(a, b); #else - simde__m64 r; - SIMDE__VECTORIZE - for (int i = 0; i < 4; i++) { - r.i16[i] = (int16_t)((a.i16[i] * b.i16[i]) >> 16); + simde__m64_private r_; + simde__m64_private a_ = simde__m64_to_private(a); + simde__m64_private b_ = simde__m64_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + const int32x4_t t1 = vmull_s16(a_.neon_i16, b_.neon_i16); + const uint32x4_t t2 = vshrq_n_u32(vreinterpretq_u32_s32(t1), 16); + const uint16x4_t t3 = vmovn_u32(t2); + r_.neon_i16 = vreinterpret_s16_u16(t3); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) { + r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, + ((a_.i16[i] * b_.i16[i]) >> 16)); } - return r; +#endif + + return simde__m64_from_private(r_); #endif } #define simde_m_pmulhw(a, b) simde_mm_mulhi_pi16(a, b) +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _mm_mulhi_pi16(a, b) simde_mm_mulhi_pi16(a, b) +#define _m_pmulhw(a, b) simde_mm_mulhi_pi16(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 
simde_mm_mullo_pi16(simde__m64 a, simde__m64 b) { -#if defined(SIMDE_MMX_NATIVE) - return SIMDE__M64_C(_mm_mullo_pi16(a.n, b.n)); +#if defined(SIMDE_X86_MMX_NATIVE) + return _mm_mullo_pi16(a, b); #else - simde__m64 r; - SIMDE__VECTORIZE - for (int i = 0; i < 4; i++) { - r.i16[i] = (int16_t)((a.i16[i] * b.i16[i]) & 0xffff); + simde__m64_private r_; + simde__m64_private a_ = simde__m64_to_private(a); + simde__m64_private b_ = simde__m64_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + const int32x4_t t1 = vmull_s16(a_.neon_i16, b_.neon_i16); + const uint16x4_t t2 = vmovn_u32(vreinterpretq_u32_s32(t1)); + r_.neon_i16 = vreinterpret_s16_u16(t2); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) { + r_.i16[i] = HEDLEY_STATIC_CAST( + int16_t, ((a_.i16[i] * b_.i16[i]) & 0xffff)); } - return r; +#endif + + return simde__m64_from_private(r_); #endif } #define simde_m_pmullw(a, b) simde_mm_mullo_pi16(a, b) +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _mm_mullo_pi16(a, b) simde_mm_mullo_pi16(a, b) +#define _m_pmullw(a, b) simde_mm_mullo_pi16(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_or_si64(simde__m64 a, simde__m64 b) { -#if defined(SIMDE_MMX_NATIVE) - return SIMDE__M64_C(_mm_or_si64(a.n, b.n)); +#if defined(SIMDE_X86_MMX_NATIVE) + return _mm_or_si64(a, b); #else - simde__m64 r; - r.i64[0] = a.i64[0] | b.i64[0]; - return r; + simde__m64_private r_; + simde__m64_private a_ = simde__m64_to_private(a); + simde__m64_private b_ = simde__m64_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i32 = vorr_s32(a_.neon_i32, b_.neon_i32); +#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i64 = a_.i64 | b_.i64; +#else + r_.i64[0] = a_.i64[0] | b_.i64[0]; +#endif + + return simde__m64_from_private(r_); #endif } #define simde_m_por(a, b) simde_mm_or_si64(a, b) +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _mm_or_si64(a, b) simde_mm_or_si64(a, b) +#define _m_por(a, b) simde_mm_or_si64(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_packs_pi16(simde__m64 a, simde__m64 b) { -#if defined(SIMDE_MMX_NATIVE) - return SIMDE__M64_C(_mm_packs_pi16(a.n, b.n)); +#if defined(SIMDE_X86_MMX_NATIVE) + return _mm_packs_pi16(a, b); #else - simde__m64 r; + simde__m64_private r_; + simde__m64_private a_ = simde__m64_to_private(a); + simde__m64_private b_ = simde__m64_to_private(b); - SIMDE__VECTORIZE - for (size_t i = 0; i < (8 / sizeof(int16_t)); i++) { - if (a.i16[i] < INT8_MIN) { - r.i8[i] = INT8_MIN; - } else if (a.i16[i] > INT8_MAX) { - r.i8[i] = INT8_MAX; +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i8 = vqmovn_s16(vcombine_s16(a_.neon_i16, b_.neon_i16)); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) { + if (a_.i16[i] < INT8_MIN) { + r_.i8[i] = INT8_MIN; + } else if (a_.i16[i] > INT8_MAX) { + r_.i8[i] = INT8_MAX; } else { - r.i8[i] = (int8_t)a.i16[i]; + r_.i8[i] = HEDLEY_STATIC_CAST(int8_t, a_.i16[i]); } } - SIMDE__VECTORIZE - for (size_t i = 0; i < (8 / sizeof(int16_t)); i++) { - if (b.i16[i] < INT8_MIN) { - r.i8[i + 4] = INT8_MIN; - } else if (b.i16[i] > INT8_MAX) { - r.i8[i + 4] = INT8_MAX; + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) { + if (b_.i16[i] < INT8_MIN) { + r_.i8[i + 4] = INT8_MIN; + } else if (b_.i16[i] > INT8_MAX) { + r_.i8[i + 4] = INT8_MAX; } else { - r.i8[i + 4] = (int8_t)b.i16[i]; + r_.i8[i + 4] = HEDLEY_STATIC_CAST(int8_t, 
b_.i16[i]); } } +#endif - return r; + return simde__m64_from_private(r_); #endif } #define simde_m_packsswb(a, b) simde_mm_packs_pi16(a, b) +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _mm_packs_pi16(a, b) simde_mm_packs_pi16(a, b) +#define _m_packsswb(a, b) mm_packs_pi16(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_packs_pi32(simde__m64 a, simde__m64 b) { -#if defined(SIMDE_MMX_NATIVE) - return SIMDE__M64_C(_mm_packs_pi32(a.n, b.n)); +#if defined(SIMDE_X86_MMX_NATIVE) + return _mm_packs_pi32(a, b); #else - simde__m64 r; + simde__m64_private r_; + simde__m64_private a_ = simde__m64_to_private(a); + simde__m64_private b_ = simde__m64_to_private(b); - SIMDE__VECTORIZE - for (size_t i = 0; i < (8 / sizeof(a.i32[0])); i++) { - if (a.i32[i] < SHRT_MIN) { - r.i16[i] = SHRT_MIN; - } else if (a.i32[i] > INT16_MAX) { - r.i16[i] = INT16_MAX; +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i16 = vqmovn_s32(vcombine_s32(a_.neon_i32, b_.neon_i32)); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (8 / sizeof(a_.i32[0])); i++) { + if (a_.i32[i] < SHRT_MIN) { + r_.i16[i] = SHRT_MIN; + } else if (a_.i32[i] > INT16_MAX) { + r_.i16[i] = INT16_MAX; } else { - r.i16[i] = (int16_t)a.i32[i]; + r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, a_.i32[i]); } } - SIMDE__VECTORIZE - for (size_t i = 0; i < (8 / sizeof(b.i32[0])); i++) { - if (b.i32[i] < SHRT_MIN) { - r.i16[i + 2] = SHRT_MIN; - } else if (b.i32[i] > INT16_MAX) { - r.i16[i + 2] = INT16_MAX; + SIMDE_VECTORIZE + for (size_t i = 0; i < (8 / sizeof(b_.i32[0])); i++) { + if (b_.i32[i] < SHRT_MIN) { + r_.i16[i + 2] = SHRT_MIN; + } else if (b_.i32[i] > INT16_MAX) { + r_.i16[i + 2] = INT16_MAX; } else { - r.i16[i + 2] = (int16_t)b.i32[i]; + r_.i16[i + 2] = HEDLEY_STATIC_CAST(int16_t, b_.i32[i]); } } +#endif - return r; + return simde__m64_from_private(r_); #endif } #define simde_m_packssdw(a, b) simde_mm_packs_pi32(a, b) +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _mm_packs_pi32(a, b) simde_mm_packs_pi32(a, b) +#define _m_packssdw(a, b) simde_mm_packs_pi32(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_packs_pu16(simde__m64 a, simde__m64 b) { -#if defined(SIMDE_MMX_NATIVE) - return SIMDE__M64_C(_mm_packs_pu16(a.n, b.n)); +#if defined(SIMDE_X86_MMX_NATIVE) + return _mm_packs_pu16(a, b); #else - simde__m64 r; + simde__m64_private r_; + simde__m64_private a_ = simde__m64_to_private(a); + simde__m64_private b_ = simde__m64_to_private(b); - SIMDE__VECTORIZE - for (size_t i = 0; i < (8 / sizeof(int16_t)); i++) { - if (a.i16[i] > UINT8_MAX) { - r.u8[i] = UINT8_MAX; - } else if (a.i16[i] < 0) { - r.u8[i] = 0; +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + const int16x8_t t1 = vcombine_s16(a_.neon_i16, b_.neon_i16); + + /* Set elements which are < 0 to 0 */ + const int16x8_t t2 = + vandq_s16(t1, vreinterpretq_s16_u16(vcgezq_s16(t1))); + + /* Vector with all s16 elements set to UINT8_MAX */ + const int16x8_t vmax = vmovq_n_s16((int16_t)UINT8_MAX); + + /* Elements which are within the acceptable range */ + const int16x8_t le_max = + vandq_s16(t2, vreinterpretq_s16_u16(vcleq_s16(t2, vmax))); + const int16x8_t gt_max = + vandq_s16(vmax, vreinterpretq_s16_u16(vcgtq_s16(t2, vmax))); + + /* Final values as 16-bit integers */ + const int16x8_t values = vorrq_s16(le_max, gt_max); + + r_.neon_u8 = vmovn_u16(vreinterpretq_u16_s16(values)); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) { + if (a_.i16[i] > UINT8_MAX) { + r_.u8[i] 
= UINT8_MAX; + } else if (a_.i16[i] < 0) { + r_.u8[i] = 0; } else { - r.u8[i] = (int8_t)a.i16[i]; + r_.u8[i] = HEDLEY_STATIC_CAST(uint8_t, a_.i16[i]); } } - SIMDE__VECTORIZE - for (size_t i = 0; i < (8 / sizeof(int16_t)); i++) { - if (b.i16[i] > UINT8_MAX) { - r.u8[i + 4] = UINT8_MAX; - } else if (b.i16[i] < 0) { - r.u8[i + 4] = 0; + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) { + if (b_.i16[i] > UINT8_MAX) { + r_.u8[i + 4] = UINT8_MAX; + } else if (b_.i16[i] < 0) { + r_.u8[i + 4] = 0; } else { - r.u8[i + 4] = (int8_t)b.i16[i]; + r_.u8[i + 4] = HEDLEY_STATIC_CAST(uint8_t, b_.i16[i]); } } +#endif - return r; + return simde__m64_from_private(r_); #endif } #define simde_m_packuswb(a, b) simde_mm_packs_pu16(a, b) +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _mm_packs_pu16(a, b) simde_mm_packs_pu16(a, b) +#define _m_packuswb(a, b) simde_mm_packs_pu16(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_set_pi8(int8_t e7, int8_t e6, int8_t e5, int8_t e4, int8_t e3, int8_t e2, int8_t e1, int8_t e0) { -#if defined(SIMDE_MMX_NATIVE) - return SIMDE__M64_C(_mm_set_pi8(e7, e6, e5, e4, e3, e2, e1, e0)); +#if defined(SIMDE_X86_MMX_NATIVE) + return _mm_set_pi8(e7, e6, e5, e4, e3, e2, e1, e0); #else - simde__m64 r; - r.i8[0] = e0; - r.i8[1] = e1; - r.i8[2] = e2; - r.i8[3] = e3; - r.i8[4] = e4; - r.i8[5] = e5; - r.i8[6] = e6; - r.i8[7] = e7; - return r; + simde__m64_private r_; + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + const int8_t v[sizeof(r_.i8) / sizeof(r_.i8[0])] = {e0, e1, e2, e3, + e4, e5, e6, e7}; + r_.neon_i8 = vld1_s8(v); +#else + r_.i8[0] = e0; + r_.i8[1] = e1; + r_.i8[2] = e2; + r_.i8[3] = e3; + r_.i8[4] = e4; + r_.i8[5] = e5; + r_.i8[6] = e6; + r_.i8[7] = e7; +#endif + + return simde__m64_from_private(r_); #endif } +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _mm_set_pi8(e7, e6, e5, e4, e3, e2, e1, e0) \ + simde_mm_set_pi8(e7, e6, e5, e4, e3, e2, e1, e0) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_x_mm_set_pu8(uint8_t e7, uint8_t e6, uint8_t e5, uint8_t e4, uint8_t e3, uint8_t e2, uint8_t e1, uint8_t e0) { -#if defined(SIMDE_MMX_NATIVE) - return SIMDE__M64_C(_mm_set_pi8((int8_t)e7, (int8_t)e6, (int8_t)e5, - (int8_t)e4, (int8_t)e3, (int8_t)e2, - (int8_t)e1, (int8_t)e0)); + simde__m64_private r_; + +#if defined(SIMDE_X86_MMX_NATIVE) + r_.n = _mm_set_pi8( + HEDLEY_STATIC_CAST(int8_t, e7), HEDLEY_STATIC_CAST(int8_t, e6), + HEDLEY_STATIC_CAST(int8_t, e5), HEDLEY_STATIC_CAST(int8_t, e4), + HEDLEY_STATIC_CAST(int8_t, e3), HEDLEY_STATIC_CAST(int8_t, e2), + HEDLEY_STATIC_CAST(int8_t, e1), HEDLEY_STATIC_CAST(int8_t, e0)); +#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + const uint8_t v[sizeof(r_.u8) / sizeof(r_.u8[0])] = {e0, e1, e2, e3, + e4, e5, e6, e7}; + r_.neon_u8 = vld1_u8(v); #else - simde__m64 r; - r.u8[0] = e0; - r.u8[1] = e1; - r.u8[2] = e2; - r.u8[3] = e3; - r.u8[4] = e4; - r.u8[5] = e5; - r.u8[6] = e6; - r.u8[7] = e7; - return r; + r_.u8[0] = e0; + r_.u8[1] = e1; + r_.u8[2] = e2; + r_.u8[3] = e3; + r_.u8[4] = e4; + r_.u8[5] = e5; + r_.u8[6] = e6; + r_.u8[7] = e7; #endif + + return simde__m64_from_private(r_); } -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_set_pi16(int16_t e3, int16_t e2, int16_t e1, int16_t e0) { -#if defined(SIMDE_MMX_NATIVE) - return SIMDE__M64_C(_mm_set_pi16(e3, e2, e1, e0)); +#if defined(SIMDE_X86_MMX_NATIVE) + return _mm_set_pi16(e3, e2, e1, e0); #else - simde__m64 r; - r.i16[0] = e0; - r.i16[1] = e1; - 
r.i16[2] = e2; - r.i16[3] = e3; - return r; + simde__m64_private r_; + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + const int16_t v[sizeof(r_.i16) / sizeof(r_.i16[0])] = {e0, e1, e2, e3}; + r_.neon_i16 = vld1_s16(v); +#else + r_.i16[0] = e0; + r_.i16[1] = e1; + r_.i16[2] = e2; + r_.i16[3] = e3; +#endif + return simde__m64_from_private(r_); #endif } +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _mm_set_pi16(e3, e2, e1, e0) simde_mm_set_pi16(e3, e2, e1, e0) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_x_mm_set_pu16(uint16_t e3, uint16_t e2, uint16_t e1, uint16_t e0) { -#if defined(SIMDE_MMX_NATIVE) - return SIMDE__M64_C(_mm_set_pi16((int16_t)e3, (int16_t)e2, (int16_t)e1, - (int16_t)e0)); + simde__m64_private r_; + +#if defined(SIMDE_X86_MMX_NATIVE) + r_.n = _mm_set_pi16(HEDLEY_STATIC_CAST(int16_t, e3), + HEDLEY_STATIC_CAST(int16_t, e2), + HEDLEY_STATIC_CAST(int16_t, e1), + HEDLEY_STATIC_CAST(int16_t, e0)); +#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + const uint16_t v[sizeof(r_.u16) / sizeof(r_.u16[0])] = {e0, e1, e2, e3}; + r_.neon_u16 = vld1_u16(v); #else - simde__m64 r; - r.u16[0] = e0; - r.u16[1] = e1; - r.u16[2] = e2; - r.u16[3] = e3; - return r; + r_.u16[0] = e0; + r_.u16[1] = e1; + r_.u16[2] = e2; + r_.u16[3] = e3; #endif + + return simde__m64_from_private(r_); } -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_x_mm_set_pu32(uint32_t e1, uint32_t e0) { -#if defined(SIMDE_MMX_NATIVE) - return SIMDE__M64_C(_mm_set_pi32((int32_t)e1, (int32_t)e0)); + simde__m64_private r_; + +#if defined(SIMDE_X86_MMX_NATIVE) + r_.n = _mm_set_pi32(HEDLEY_STATIC_CAST(int32_t, e1), + HEDLEY_STATIC_CAST(int32_t, e0)); +#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + const uint32_t v[sizeof(r_.u32) / sizeof(r_.u32[0])] = {e0, e1}; + r_.neon_u32 = vld1_u32(v); #else - simde__m64 r; - r.u32[0] = e0; - r.u32[1] = e1; - return r; + r_.u32[0] = e0; + r_.u32[1] = e1; #endif + + return simde__m64_from_private(r_); } -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_set_pi32(int32_t e1, int32_t e0) { -#if defined(SIMDE_MMX_NATIVE) - return SIMDE__M64_C(_mm_set_pi32(e1, e0)); + simde__m64_private r_; + +#if defined(SIMDE_X86_MMX_NATIVE) + r_.n = _mm_set_pi32(e1, e0); +#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + const int32_t v[sizeof(r_.i32) / sizeof(r_.i32[0])] = {e0, e1}; + r_.neon_i32 = vld1_s32(v); #else - simde__m64 r; - r.i32[0] = e0; - r.i32[1] = e1; - return r; + r_.i32[0] = e0; + r_.i32[1] = e1; #endif + + return simde__m64_from_private(r_); } +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _mm_set_pi32(e1, e0) simde_mm_set_pi32(e1, e0) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES +simde__m64 simde_x_mm_set_pi64(int64_t e0) +{ + simde__m64_private r_; + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + const int64_t v[sizeof(r_.i64) / sizeof(r_.i64[0])] = {e0}; + r_.neon_i64 = vld1_s64(v); +#else + r_.i64[0] = e0; +#endif + + return simde__m64_from_private(r_); +} + +SIMDE_FUNCTION_ATTRIBUTES +simde__m64 simde_x_mm_set_f32x2(simde_float32 e1, simde_float32 e0) +{ + simde__m64_private r_; + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + const simde_float32 v[sizeof(r_.f32) / sizeof(r_.f32[0])] = {e0, e1}; + r_.neon_f32 = vld1_f32(v); +#else + r_.f32[0] = e0; + r_.f32[1] = e1; +#endif + + return simde__m64_from_private(r_); +} + +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_set1_pi8(int8_t a) { -#if defined(SIMDE_MMX_NATIVE) - return SIMDE__M64_C(_mm_set1_pi8(a)); +#if defined(SIMDE_X86_MMX_NATIVE) 
+ return _mm_set1_pi8(a); +#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + simde__m64_private r_; + r_.neon_i8 = vmov_n_s8(a); + return simde__m64_from_private(r_); #else return simde_mm_set_pi8(a, a, a, a, a, a, a, a); #endif } +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _mm_set1_pi8(a) simde_mm_set1_pi8(a) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_set1_pi16(int16_t a) { -#if defined(SIMDE_MMX_NATIVE) - return SIMDE__M64_C(_mm_set1_pi16(a)); +#if defined(SIMDE_X86_MMX_NATIVE) + return _mm_set1_pi16(a); +#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + simde__m64_private r_; + r_.neon_i16 = vmov_n_s16(a); + return simde__m64_from_private(r_); #else return simde_mm_set_pi16(a, a, a, a); #endif } +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _mm_set1_pi16(a) simde_mm_set1_pi16(a) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_set1_pi32(int32_t a) { -#if defined(SIMDE_MMX_NATIVE) - return SIMDE__M64_C(_mm_set1_pi32(a)); +#if defined(SIMDE_X86_MMX_NATIVE) + return _mm_set1_pi32(a); +#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + simde__m64_private r_; + r_.neon_i32 = vmov_n_s32(a); + return simde__m64_from_private(r_); #else return simde_mm_set_pi32(a, a); #endif } +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _mm_set1_pi32(a) simde_mm_set1_pi32(a) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_setr_pi8(int8_t e7, int8_t e6, int8_t e5, int8_t e4, int8_t e3, int8_t e2, int8_t e1, int8_t e0) { -#if defined(SIMDE_MMX_NATIVE) - return SIMDE__M64_C(_mm_setr_pi8(e7, e6, e5, e4, e3, e2, e1, e0)); +#if defined(SIMDE_X86_MMX_NATIVE) + return _mm_setr_pi8(e7, e6, e5, e4, e3, e2, e1, e0); #else return simde_mm_set_pi8(e0, e1, e2, e3, e4, e5, e6, e7); #endif } +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _mm_setr_pi8(e7, e6, e5, e4, e3, e2, e1, e0) \ + simde_mm_setr_pi8(e7, e6, e5, e4, e3, e2, e1, e0) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_setr_pi16(int16_t e3, int16_t e2, int16_t e1, int16_t e0) { -#if defined(SIMDE_MMX_NATIVE) - return SIMDE__M64_C(_mm_setr_pi16(e3, e2, e1, e0)); +#if defined(SIMDE_X86_MMX_NATIVE) + return _mm_setr_pi16(e3, e2, e1, e0); #else return simde_mm_set_pi16(e0, e1, e2, e3); #endif } +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _mm_setr_pi16(e3, e2, e1, e0) simde_mm_setr_pi16(e3, e2, e1, e0) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_setr_pi32(int32_t e1, int32_t e0) { -#if defined(SIMDE_MMX_NATIVE) - return SIMDE__M64_C(_mm_setr_pi32(e1, e0)); +#if defined(SIMDE_X86_MMX_NATIVE) + return _mm_setr_pi32(e1, e0); #else return simde_mm_set_pi32(e0, e1); #endif } +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _mm_setr_pi32(e1, e0) simde_mm_setr_pi32(e1, e0) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_setzero_si64(void) { -#if defined(SIMDE_MMX_NATIVE) - return SIMDE__M64_C(_mm_setzero_si64()); +#if defined(SIMDE_X86_MMX_NATIVE) + return _mm_setzero_si64(); +#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + simde__m64_private r_; + r_.neon_u32 = vmov_n_u32(0); + return simde__m64_from_private(r_); #else return simde_mm_set_pi32(0, 0); #endif } +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _mm_setzero_si64() simde_mm_setzero_si64() +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES +simde__m64 simde_x_mm_setone_si64(void) +{ + return 
simde_mm_set1_pi32(~INT32_C(0)); +} + +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_sll_pi16(simde__m64 a, simde__m64 count) { -#if defined(SIMDE_MMX_NATIVE) - return SIMDE__M64_C(_mm_sll_pi16(a.n, count.n)); +#if defined(SIMDE_X86_MMX_NATIVE) + return _mm_sll_pi16(a, count); #else - simde__m64 r; + simde__m64_private r_; + simde__m64_private a_ = simde__m64_to_private(a); + simde__m64_private count_ = simde__m64_to_private(count); - if (HEDLEY_UNLIKELY(count.u64[0] > 15)) { - memset(&r, 0, sizeof(r)); - return r; +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i16 = vshl_s16(a_.neon_i16, vmov_n_s16((int16_t)vget_lane_u64( + count_.neon_u64, 0))); +#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + r_.i16 = a_.i16 << count_.u64[0]; +#else + if (HEDLEY_UNLIKELY(count_.u64[0] > 15)) { + simde_memset(&r_, 0, sizeof(r_)); + return simde__m64_from_private(r_); } - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.u16) / sizeof(r.u16[0])); i++) { - r.u16[i] = a.u16[i] << count.u64[0]; + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.u16) / sizeof(r_.u16[0])); i++) { + r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, + a_.u16[i] << count_.u64[0]); } - return r; +#endif + + return simde__m64_from_private(r_); #endif } #define simde_m_psllw(a, count) simde_mm_sll_pi16(a, count) +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _mm_sll_pi16(a, count) simde_mm_sll_pi16(a, count) +#define _m_psllw(a, count) simde_mm_sll_pi16(a, count) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_sll_pi32(simde__m64 a, simde__m64 count) { -#if defined(SIMDE_MMX_NATIVE) - return SIMDE__M64_C(_mm_sll_pi32(a.n, count.n)); +#if defined(SIMDE_X86_MMX_NATIVE) + return _mm_sll_pi32(a, count); #else - simde__m64 r; + simde__m64_private r_; + simde__m64_private a_ = simde__m64_to_private(a); + simde__m64_private count_ = simde__m64_to_private(count); - if (HEDLEY_UNLIKELY(count.u64[0] > 31)) { - memset(&r, 0, sizeof(r)); - return r; +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i32 = vshl_s32(a_.neon_i32, vmov_n_s32((int32_t)vget_lane_u64( + count_.neon_u64, 0))); +#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + r_.i32 = a_.i32 << count_.u64[0]; +#else + if (HEDLEY_UNLIKELY(count_.u64[0] > 31)) { + simde_memset(&r_, 0, sizeof(r_)); + return simde__m64_from_private(r_); } - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.u32) / sizeof(r.u32[0])); i++) { - r.u32[i] = a.u32[i] << count.u64[0]; + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.u32) / sizeof(r_.u32[0])); i++) { + r_.u32[i] = a_.u32[i] << count_.u64[0]; } - return r; +#endif + + return simde__m64_from_private(r_); #endif } #define simde_m_pslld(a, count) simde_mm_sll_pi32(a, count) +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _mm_sll_pi32(a, count) simde_mm_sll_pi32(a, count) +#define _m_pslld(a, count) simde_mm_sll_pi32(a, count) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_slli_pi16(simde__m64 a, int count) { -#if defined(SIMDE_MMX_NATIVE) && !defined(__PGI) - return SIMDE__M64_C(_mm_slli_pi16(a.n, count)); +#if defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI) + return _mm_slli_pi16(a, count); #else - simde__m64 r; + simde__m64_private r_; + simde__m64_private a_ = simde__m64_to_private(a); - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.u16) / sizeof(r.u16[0])); i++) { - r.u16[i] = a.u16[i] << count; +#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + r_.i16 = a_.i16 << count; +#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i16 = 
vshl_s16(a_.neon_i16, vmov_n_s16((int16_t)count)); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.u16) / sizeof(r_.u16[0])); i++) { + r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, a_.u16[i] << count); } +#endif - return r; + return simde__m64_from_private(r_); #endif } #define simde_m_psllwi(a, count) simde_mm_slli_pi16(a, count) +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _mm_slli_pi16(a, count) simde_mm_slli_pi16(a, count) +#define _m_psllwi(a, count) simde_mm_slli_pi16(a, count) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_slli_pi32(simde__m64 a, int count) { -#if defined(SIMDE_MMX_NATIVE) && !defined(__PGI) - return SIMDE__M64_C(_mm_slli_pi32(a.n, count)); +#if defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI) + return _mm_slli_pi32(a, count); #else - simde__m64 r; + simde__m64_private r_; + simde__m64_private a_ = simde__m64_to_private(a); - SIMDE__VECTORIZE - for (size_t i = 0; i < (8 / sizeof(int)); i++) { - r.u32[i] = a.u32[i] << count; +#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + r_.i32 = a_.i32 << count; +#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i32 = vshl_s32(a_.neon_i32, vmov_n_s32((int32_t)count)); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.u32) / sizeof(r_.u32[0])); i++) { + r_.u32[i] = a_.u32[i] << count; } +#endif - return r; + return simde__m64_from_private(r_); #endif } #define simde_m_pslldi(a, b) simde_mm_slli_pi32(a, b) +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _mm_slli_pi32(a, count) simde_mm_slli_pi32(a, count) +#define _m_pslldi(a, count) simde_mm_slli_pi32(a, count) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_slli_si64(simde__m64 a, int count) { -#if defined(SIMDE_MMX_NATIVE) - return SIMDE__M64_C(_mm_slli_si64(a.n, count)); +#if defined(SIMDE_X86_MMX_NATIVE) + return _mm_slli_si64(a, count); #else - simde__m64 r; - r.u64[0] = a.u64[0] << count; - return r; + simde__m64_private r_; + simde__m64_private a_ = simde__m64_to_private(a); + +#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + r_.i64 = a_.i64 << count; +#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i64 = vshl_s64(a_.neon_i64, vmov_n_s64((int64_t)count)); +#else + r_.u64[0] = a_.u64[0] << count; +#endif + + return simde__m64_from_private(r_); #endif } #define simde_m_psllqi(a, count) simde_mm_slli_si64(a, count) +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _mm_slli_si64(a, count) simde_mm_slli_si64(a, count) +#define _m_psllqi(a, count) simde_mm_slli_si64(a, count) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_sll_si64(simde__m64 a, simde__m64 count) { -#if defined(SIMDE_MMX_NATIVE) - return SIMDE__M64_C(_mm_sll_si64(a.n, count.n)); +#if defined(SIMDE_X86_MMX_NATIVE) + return _mm_sll_si64(a, count); #else - simde__m64 r; + simde__m64_private r_; + simde__m64_private a_ = simde__m64_to_private(a); + simde__m64_private count_ = simde__m64_to_private(count); - if (HEDLEY_UNLIKELY(count.u64[0] > 63)) { - memset(&r, 0, sizeof(r)); - return r; +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i64 = vshl_s64(a_.neon_i64, count_.neon_i64); +#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i64 = a_.i64 << count_.i64; +#else + if (HEDLEY_UNLIKELY(count_.u64[0] > 63)) { + simde_memset(&r_, 0, sizeof(r_)); + return simde__m64_from_private(r_); } - r.u64[0] = a.u64[0] << count.u64[0]; + r_.u64[0] = a_.u64[0] << count_.u64[0]; +#endif - return r; + return simde__m64_from_private(r_); #endif } #define simde_m_psllq(a, count) 
simde_mm_sll_si64(a, count) +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _mm_sll_si64(a, count) simde_mm_sll_si64(a, count) +#define _m_psllq(a, count) simde_mm_sll_si64(a, count) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_srl_pi16(simde__m64 a, simde__m64 count) { -#if defined(SIMDE_MMX_NATIVE) - return SIMDE__M64_C(_mm_srl_pi16(a.n, count.n)); +#if defined(SIMDE_X86_MMX_NATIVE) + return _mm_srl_pi16(a, count); #else - simde__m64 r; + simde__m64_private r_; + simde__m64_private a_ = simde__m64_to_private(a); + simde__m64_private count_ = simde__m64_to_private(count); - if (HEDLEY_UNLIKELY(count.u64[0] > 15)) { - memset(&r, 0, sizeof(r)); - return r; +#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + r_.u16 = a_.u16 >> count_.u64[0]; +#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_u16 = vshl_u16( + a_.neon_u16, + vmov_n_s16(-((int16_t)vget_lane_u64(count_.neon_u64, 0)))); +#else + if (HEDLEY_UNLIKELY(count_.u64[0] > 15)) { + simde_memset(&r_, 0, sizeof(r_)); + return simde__m64_from_private(r_); } - SIMDE__VECTORIZE - for (size_t i = 0; i < sizeof(r.u16) / sizeof(r.u16[0]); i++) { - r.u16[i] = a.u16[i] >> count.u64[0]; + SIMDE_VECTORIZE + for (size_t i = 0; i < sizeof(r_.u16) / sizeof(r_.u16[0]); i++) { + r_.u16[i] = a_.u16[i] >> count_.u64[0]; } - return r; +#endif + + return simde__m64_from_private(r_); #endif } #define simde_m_psrlw(a, count) simde_mm_srl_pi16(a, count) +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _mm_srl_pi16(a, count) simde_mm_srl_pi16(a, count) +#define _m_psrlw(a, count) simde_mm_srl_pi16(a, count) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_srl_pi32(simde__m64 a, simde__m64 count) { -#if defined(SIMDE_MMX_NATIVE) - return SIMDE__M64_C(_mm_srl_pi32(a.n, count.n)); +#if defined(SIMDE_X86_MMX_NATIVE) + return _mm_srl_pi32(a, count); #else - simde__m64 r; + simde__m64_private r_; + simde__m64_private a_ = simde__m64_to_private(a); + simde__m64_private count_ = simde__m64_to_private(count); - if (HEDLEY_UNLIKELY(count.u64[0] > 31)) { - memset(&r, 0, sizeof(r)); - return r; +#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + r_.u32 = a_.u32 >> count_.u64[0]; +#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_u32 = vshl_u32( + a_.neon_u32, + vmov_n_s32(-((int32_t)vget_lane_u64(count_.neon_u64, 0)))); +#else + if (HEDLEY_UNLIKELY(count_.u64[0] > 31)) { + simde_memset(&r_, 0, sizeof(r_)); + return simde__m64_from_private(r_); } - SIMDE__VECTORIZE - for (size_t i = 0; i < sizeof(r.u32) / sizeof(r.u32[0]); i++) { - r.u32[i] = a.u32[i] >> count.u64[0]; + SIMDE_VECTORIZE + for (size_t i = 0; i < sizeof(r_.u32) / sizeof(r_.u32[0]); i++) { + r_.u32[i] = a_.u32[i] >> count_.u64[0]; } - return r; +#endif + + return simde__m64_from_private(r_); #endif } #define simde_m_psrld(a, count) simde_mm_srl_pi32(a, count) +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _mm_srl_pi32(a, count) simde_mm_srl_pi32(a, count) +#define _m_psrld(a, count) simde_mm_srl_pi32(a, count) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_srli_pi16(simde__m64 a, int count) { -#if defined(SIMDE_MMX_NATIVE) && !defined(__PGI) - return SIMDE__M64_C(_mm_srli_pi16(a.n, count)); +#if defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI) + return _mm_srli_pi16(a, count); #else - simde__m64 r; + simde__m64_private r_; + simde__m64_private a_ = simde__m64_to_private(a); - SIMDE__VECTORIZE - for (size_t i = 0; i < (8 / sizeof(uint16_t)); i++) { - r.u16[i] = a.u16[i] >> count; 
+#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + r_.u16 = a_.u16 >> count; +#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_u16 = vshl_u16(a_.neon_u16, vmov_n_s16(-((int16_t)count))); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.u16) / sizeof(r_.u16[0])); i++) { + r_.u16[i] = a_.u16[i] >> count; } +#endif - return r; + return simde__m64_from_private(r_); #endif } #define simde_m_psrlwi(a, count) simde_mm_srli_pi16(a, count) +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _mm_srli_pi16(a, count) simde_mm_srli_pi16(a, count) +#define _m_psrlwi(a, count) simde_mm_srli_pi16(a, count) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_srli_pi32(simde__m64 a, int count) { -#if defined(SIMDE_MMX_NATIVE) && !defined(__PGI) - return SIMDE__M64_C(_mm_srli_pi32(a.n, count)); +#if defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI) + return _mm_srli_pi32(a, count); #else - simde__m64 r; + simde__m64_private r_; + simde__m64_private a_ = simde__m64_to_private(a); - SIMDE__VECTORIZE - for (size_t i = 0; i < (8 / sizeof(int)); i++) { - r.u32[i] = a.u32[i] >> count; +#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + r_.u32 = a_.u32 >> count; +#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_u32 = vshl_u32(a_.neon_u32, vmov_n_s32(-((int32_t)count))); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.u32) / sizeof(r_.u32[0])); i++) { + r_.u32[i] = a_.u32[i] >> count; } +#endif - return r; + return simde__m64_from_private(r_); #endif } #define simde_m_psrldi(a, count) simde_mm_srli_pi32(a, count) +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _mm_srli_pi32(a, count) simde_mm_srli_pi32(a, count) +#define _m_psrldi(a, count) simde_mm_srli_pi32(a, count) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_srli_si64(simde__m64 a, int count) { -#if defined(SIMDE_MMX_NATIVE) && !defined(__PGI) - return SIMDE__M64_C(_mm_srli_si64(a.n, count)); +#if defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI) + return _mm_srli_si64(a, count); #else - simde__m64 r; - r.u64[0] = a.u64[0] >> count; - return r; + simde__m64_private r_; + simde__m64_private a_ = simde__m64_to_private(a); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_u64 = vshl_u64(a_.neon_u64, vmov_n_s64(-count)); +#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + r_.u64 = a_.u64 >> count; +#else + r_.u64[0] = a_.u64[0] >> count; +#endif + + return simde__m64_from_private(r_); #endif } #define simde_m_psrlqi(a, count) simde_mm_srli_si64(a, count) +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _mm_srli_si64(a, count) simde_mm_srli_si64(a, count) +#define _m_psrlqi(a, count) simde_mm_srli_si64(a, count) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_srl_si64(simde__m64 a, simde__m64 count) { -#if defined(SIMDE_MMX_NATIVE) - return SIMDE__M64_C(_mm_srl_si64(a.n, count.n)); +#if defined(SIMDE_X86_MMX_NATIVE) + return _mm_srl_si64(a, count); #else - simde__m64 r; + simde__m64_private r_; + simde__m64_private a_ = simde__m64_to_private(a); + simde__m64_private count_ = simde__m64_to_private(count); - if (HEDLEY_UNLIKELY(count.u64[0] > 63)) { - memset(&r, 0, sizeof(r)); - return r; +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + r_.neon_u64 = vshl_u64(a_.neon_u64, vneg_s64(count_.neon_i64)); +#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.u64 = a_.u64 >> count_.u64; +#else + if (HEDLEY_UNLIKELY(count_.u64[0] > 63)) { + simde_memset(&r_, 0, sizeof(r_)); + return simde__m64_from_private(r_); } - r.u64[0] = a.u64[0] >> 
count.u64[0]; - return r; + r_.u64[0] = a_.u64[0] >> count_.u64[0]; +#endif + + return simde__m64_from_private(r_); #endif } #define simde_m_psrlq(a, count) simde_mm_srl_si64(a, count) +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _mm_srl_si64(a, count) simde_mm_srl_si64(a, count) +#define _m_psrlq(a, count) simde_mm_srl_si64(a, count) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_srai_pi16(simde__m64 a, int count) { -#if defined(SIMDE_MMX_NATIVE) && !defined(__PGI) - return SIMDE__M64_C(_mm_srai_pi16(a.n, count)); +#if defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI) + return _mm_srai_pi16(a, count); #else - simde__m64 r; - - const uint16_t m = - (uint16_t)((~0U) << ((sizeof(int16_t) * CHAR_BIT) - count)); + simde__m64_private r_; + simde__m64_private a_ = simde__m64_to_private(a); - SIMDE__VECTORIZE - for (size_t i = 0; i < (8 / sizeof(int16_t)); i++) { - const uint16_t is_neg = ((uint16_t)( - ((a.u16[i]) >> ((sizeof(int16_t) * CHAR_BIT) - 1)))); - r.u16[i] = (a.u16[i] >> count) | (m * is_neg); +#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + r_.i16 = a_.i16 >> (count & 0xff); +#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i16 = vshl_s16(a_.neon_i16, vmov_n_s16(-HEDLEY_STATIC_CAST(int16_t, count)); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) { + r_.i16[i] = a_.i16[i] >> (count & 0xff); } +#endif - return r; + return simde__m64_from_private(r_); #endif } #define simde_m_psrawi(a, count) simde_mm_srai_pi16(a, count) +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _mm_srai_pi16(a, count) simde_mm_srai_pi16(a, count) +#define _m_psrawi(a, count) simde_mm_srai_pi16(a, count) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_srai_pi32(simde__m64 a, int count) { -#if defined(SIMDE_MMX_NATIVE) && !defined(__PGI) - return SIMDE__M64_C(_mm_srai_pi32(a.n, count)); +#if defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI) + return _mm_srai_pi32(a, count); #else - simde__m64 r; + simde__m64_private r_; + simde__m64_private a_ = simde__m64_to_private(a); - const uint32_t m = - (uint32_t)((~0U) << ((sizeof(int) * CHAR_BIT) - count)); - SIMDE__VECTORIZE - for (size_t i = 0; i < (8 / sizeof(int)); i++) { - const uint32_t is_neg = ((uint32_t)( - ((a.u32[i]) >> ((sizeof(int) * CHAR_BIT) - 1)))); - r.u32[i] = (a.u32[i] >> count) | (m * is_neg); +#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + r_.i32 = a_.i32 >> (count & 0xff); +#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i32 = vshl_s32(a_.neon_i32, + vmov_n_s32(-HEDLEY_STATIC_CAST(int32_t, count))); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) { + r_.i32[i] = a_.i32[i] >> (count & 0xff); } +#endif - return r; + return simde__m64_from_private(r_); #endif } -#define simde_m_srai_pi32(a, count) simde_mm_srai_pi32(a, count) +#define simde_m_psradi(a, count) simde_mm_srai_pi32(a, count) +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _mm_srai_pi32(a, count) simde_mm_srai_pi32(a, count) +#define _m_srai_pi32(a, count) simde_mm_srai_pi32(a, count) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_sra_pi16(simde__m64 a, simde__m64 count) { -#if defined(SIMDE_MMX_NATIVE) - return SIMDE__M64_C(_mm_sra_pi16(a.n, count.n)); -#else - simde__m64 r; - int cnt = (int)count.i64[0]; - - if (cnt > 15 || cnt < 0) { - for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); - i++) { - r.u16[i] = (a.i16[i] < 0) ? 
0xffff : 0x0000; - } - } else { - const uint16_t m = (uint16_t)( - (~0U) << ((sizeof(int16_t) * CHAR_BIT) - cnt)); - for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); - i++) { - const uint16_t is_neg = a.i16[i] < 0; - r.u16[i] = (a.u16[i] >> cnt) | (m * is_neg); - } +#if defined(SIMDE_X86_MMX_NATIVE) + return _mm_sra_pi16(a, count); +#else + simde__m64_private r_; + simde__m64_private a_ = simde__m64_to_private(a); + simde__m64_private count_ = simde__m64_to_private(count); + const int cnt = HEDLEY_STATIC_CAST( + int, (count_.i64[0] > 15 ? 15 : count_.i64[0])); + +#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + r_.i16 = a_.i16 >> cnt; +#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i16 = + vshl_s16(a_.neon_i16, + vmov_n_s16(-HEDLEY_STATIC_CAST( + int16_t, vget_lane_u64(count_.neon_u64, 0)))); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) { + r_.i16[i] = a_.i16[i] >> cnt; } +#endif - return r; + return simde__m64_from_private(r_); #endif } #define simde_m_psraw(a, count) simde_mm_sra_pi16(a, count) +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _mm_sra_pi16(a, count) simde_mm_sra_pi16(a, count) +#define _m_psraw(a, count) simde_mm_sra_pi16(a, count) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_sra_pi32(simde__m64 a, simde__m64 count) { -#if defined(SIMDE_MMX_NATIVE) - return SIMDE__M64_C(_mm_sra_pi32(a.n, count.n)); -#else - simde__m64 r; - const uint64_t cnt = count.u64[0]; - - if (cnt > 31) { - for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); - i++) { - r.u32[i] = (a.i32[i] < 0) ? UINT32_MAX : 0; - } - } else if (cnt == 0) { - memcpy(&r, &a, sizeof(r)); - } else { - const uint32_t m = (uint32_t)( - (~0U) << ((sizeof(int32_t) * CHAR_BIT) - cnt)); - for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); - i++) { - const uint32_t is_neg = a.i32[i] < 0; - r.u32[i] = (a.u32[i] >> cnt) | (m * is_neg); - } +#if defined(SIMDE_X86_MMX_NATIVE) + return _mm_sra_pi32(a, count); +#else + simde__m64_private r_; + simde__m64_private a_ = simde__m64_to_private(a); + simde__m64_private count_ = simde__m64_to_private(count); + const int32_t cnt = + (count_.u64[0] > 31) + ? 
31 + : HEDLEY_STATIC_CAST(int32_t, count_.u64[0]); + +#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + r_.i32 = a_.i32 >> cnt; +#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i32 = + vshl_s32(a_.neon_i32, + vmov_n_s32(-HEDLEY_STATIC_CAST( + int32_t, vget_lane_u64(count_.neon_u64, 0)))); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) { + r_.i32[i] = a_.i32[i] >> cnt; } +#endif - return r; + return simde__m64_from_private(r_); #endif } #define simde_m_psrad(a, b) simde_mm_sra_pi32(a, b) +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _mm_sra_pi32(a, count) simde_mm_sra_pi32(a, count) +#define _m_psrad(a, count) simde_mm_sra_pi32(a, count) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_sub_pi8(simde__m64 a, simde__m64 b) { -#if defined(SIMDE_MMX_NATIVE) - return SIMDE__M64_C(_mm_sub_pi8(a.n, b.n)); +#if defined(SIMDE_X86_MMX_NATIVE) + return _mm_sub_pi8(a, b); #else - simde__m64 r; - SIMDE__VECTORIZE - for (size_t i = 0; i < 8; i++) { - r.i8[i] = a.i8[i] - b.i8[i]; + simde__m64_private r_; + simde__m64_private a_ = simde__m64_to_private(a); + simde__m64_private b_ = simde__m64_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i8 = vsub_s8(a_.neon_i8, b_.neon_i8); +#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i8 = a_.i8 - b_.i8; +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])); i++) { + r_.i8[i] = a_.i8[i] - b_.i8[i]; } - return r; +#endif + + return simde__m64_from_private(r_); #endif } #define simde_m_psubb(a, b) simde_mm_sub_pi8(a, b) +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _mm_sub_pi8(a, b) simde_mm_sub_pi8(a, b) +#define _m_psubb(a, b) simde_mm_sub_pi8(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_sub_pi16(simde__m64 a, simde__m64 b) { -#if defined(SIMDE_MMX_NATIVE) - return SIMDE__M64_C(_mm_sub_pi16(a.n, b.n)); +#if defined(SIMDE_X86_MMX_NATIVE) + return _mm_sub_pi16(a, b); #else - simde__m64 r; - SIMDE__VECTORIZE - for (size_t i = 0; i < (8 / sizeof(int16_t)); i++) { - r.i16[i] = a.i16[i] - b.i16[i]; + simde__m64_private r_; + simde__m64_private a_ = simde__m64_to_private(a); + simde__m64_private b_ = simde__m64_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i16 = vsub_s16(a_.neon_i16, b_.neon_i16); +#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i16 = a_.i16 - b_.i16; +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) { + r_.i16[i] = a_.i16[i] - b_.i16[i]; } - return r; +#endif + + return simde__m64_from_private(r_); #endif } #define simde_m_psubw(a, b) simde_mm_sub_pi16(a, b) +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _mm_sub_pi16(a, b) simde_mm_sub_pi16(a, b) +#define _m_psubw(a, b) simde_mm_sub_pi16(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_sub_pi32(simde__m64 a, simde__m64 b) { -#if defined(SIMDE_MMX_NATIVE) - return SIMDE__M64_C(_mm_sub_pi32(a.n, b.n)); +#if defined(SIMDE_X86_MMX_NATIVE) + return _mm_sub_pi32(a, b); #else - simde__m64 r; - SIMDE__VECTORIZE - for (size_t i = 0; i < (8 / sizeof(int)); i++) { - r.i32[i] = a.i32[i] - b.i32[i]; + simde__m64_private r_; + simde__m64_private a_ = simde__m64_to_private(a); + simde__m64_private b_ = simde__m64_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i32 = vsub_s32(a_.neon_i32, b_.neon_i32); +#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i32 = a_.i32 - b_.i32; +#else + 
SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) { + r_.i32[i] = a_.i32[i] - b_.i32[i]; } - return r; +#endif + + return simde__m64_from_private(r_); #endif } #define simde_m_psubd(a, b) simde_mm_sub_pi32(a, b) +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _mm_sub_pi32(a, b) simde_mm_sub_pi32(a, b) +#define _m_psubd(a, b) simde_mm_sub_pi32(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_subs_pi8(simde__m64 a, simde__m64 b) { -#if defined(SIMDE_MMX_NATIVE) - return SIMDE__M64_C(_mm_subs_pi8(a.n, b.n)); +#if defined(SIMDE_X86_MMX_NATIVE) + return _mm_subs_pi8(a, b); #else - simde__m64 r; - SIMDE__VECTORIZE - for (size_t i = 0; i < (8); i++) { - if (((b.i8[i]) > 0 && (a.i8[i]) < INT8_MIN + (b.i8[i]))) { - r.i8[i] = INT8_MIN; - } else if ((b.i8[i]) < 0 && (a.i8[i]) > INT8_MAX + (b.i8[i])) { - r.i8[i] = INT8_MAX; + simde__m64_private r_; + simde__m64_private a_ = simde__m64_to_private(a); + simde__m64_private b_ = simde__m64_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i8 = vqsub_s8(a_.neon_i8, b_.neon_i8); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])); i++) { + if (((b_.i8[i]) > 0 && (a_.i8[i]) < INT8_MIN + (b_.i8[i]))) { + r_.i8[i] = INT8_MIN; + } else if ((b_.i8[i]) < 0 && + (a_.i8[i]) > INT8_MAX + (b_.i8[i])) { + r_.i8[i] = INT8_MAX; } else { - r.i8[i] = (a.i8[i]) - (b.i8[i]); + r_.i8[i] = (a_.i8[i]) - (b_.i8[i]); } } - return r; +#endif + + return simde__m64_from_private(r_); #endif } #define simde_m_psubsb(a, b) simde_mm_subs_pi8(a, b) +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _mm_subs_pi8(a, b) simde_mm_subs_pi8(a, b) +#define _m_psubsb(a, b) simde_mm_subs_pi8(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_subs_pu8(simde__m64 a, simde__m64 b) { -#if defined(SIMDE_MMX_NATIVE) - return SIMDE__M64_C(_mm_subs_pu8(a.n, b.n)); +#if defined(SIMDE_X86_MMX_NATIVE) + return _mm_subs_pu8(a, b); #else - simde__m64 r; - SIMDE__VECTORIZE - for (size_t i = 0; i < (8); i++) { - const int32_t x = a.u8[i] - b.u8[i]; + simde__m64_private r_; + simde__m64_private a_ = simde__m64_to_private(a); + simde__m64_private b_ = simde__m64_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_u8 = vqsub_u8(a_.neon_u8, b_.neon_u8); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.u8) / sizeof(r_.u8[0])); i++) { + const int32_t x = a_.u8[i] - b_.u8[i]; if (x < 0) { - r.u8[i] = 0; + r_.u8[i] = 0; } else if (x > UINT8_MAX) { - r.u8[i] = UINT8_MAX; + r_.u8[i] = UINT8_MAX; } else { - r.u8[i] = (uint8_t)x; + r_.u8[i] = HEDLEY_STATIC_CAST(uint8_t, x); } } - return r; +#endif + + return simde__m64_from_private(r_); #endif } #define simde_m_psubusb(a, b) simde_mm_subs_pu8(a, b) +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _mm_subs_pu8(a, b) simde_mm_subs_pu8(a, b) +#define _m_psubusb(a, b) simde_mm_subs_pu8(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_subs_pi16(simde__m64 a, simde__m64 b) { -#if defined(SIMDE_MMX_NATIVE) - return SIMDE__M64_C(_mm_subs_pi16(a.n, b.n)); +#if defined(SIMDE_X86_MMX_NATIVE) + return _mm_subs_pi16(a, b); #else - simde__m64 r; - SIMDE__VECTORIZE - for (size_t i = 0; i < (8 / sizeof(int16_t)); i++) { - if (((b.i16[i]) > 0 && (a.i16[i]) < SHRT_MIN + (b.i16[i]))) { - r.i16[i] = SHRT_MIN; - } else if ((b.i16[i]) < 0 && - (a.i16[i]) > INT16_MAX + (b.i16[i])) { - r.i16[i] = INT16_MAX; + simde__m64_private r_; + 
simde__m64_private a_ = simde__m64_to_private(a); + simde__m64_private b_ = simde__m64_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i16 = vqsub_s16(a_.neon_i16, b_.neon_i16); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) { + if (((b_.i16[i]) > 0 && (a_.i16[i]) < SHRT_MIN + (b_.i16[i]))) { + r_.i16[i] = SHRT_MIN; + } else if ((b_.i16[i]) < 0 && + (a_.i16[i]) > INT16_MAX + (b_.i16[i])) { + r_.i16[i] = INT16_MAX; } else { - r.i16[i] = (a.i16[i]) - (b.i16[i]); + r_.i16[i] = (a_.i16[i]) - (b_.i16[i]); } } - return r; +#endif + + return simde__m64_from_private(r_); #endif } #define simde_m_psubsw(a, b) simde_mm_subs_pi16(a, b) +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _mm_subs_pi16(a, b) simde_mm_subs_pi16(a, b) +#define _m_psubsw(a, b) simde_mm_subs_pi16(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_subs_pu16(simde__m64 a, simde__m64 b) { -#if defined(SIMDE_MMX_NATIVE) - return SIMDE__M64_C(_mm_subs_pu16(a.n, b.n)); +#if defined(SIMDE_X86_MMX_NATIVE) + return _mm_subs_pu16(a, b); #else - simde__m64 r; - SIMDE__VECTORIZE - for (size_t i = 0; i < (8 / sizeof(uint16_t)); i++) { - const int x = a.u16[i] - b.u16[i]; + simde__m64_private r_; + simde__m64_private a_ = simde__m64_to_private(a); + simde__m64_private b_ = simde__m64_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_u16 = vqsub_u16(a_.neon_u16, b_.neon_u16); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.u16) / sizeof(r_.u16[0])); i++) { + const int x = a_.u16[i] - b_.u16[i]; if (x < 0) { - r.u16[i] = 0; + r_.u16[i] = 0; } else if (x > UINT16_MAX) { - r.u16[i] = UINT16_MAX; + r_.u16[i] = UINT16_MAX; } else { - r.u16[i] = (uint16_t)x; + r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, x); } } - return r; +#endif + + return simde__m64_from_private(r_); #endif } #define simde_m_psubusw(a, b) simde_mm_subs_pu16(a, b) +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _mm_subs_pu16(a, b) simde_mm_subs_pu16(a, b) +#define _m_psubusw(a, b) simde_mm_subs_pu16(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_unpackhi_pi8(simde__m64 a, simde__m64 b) { -#if defined(SIMDE_MMX_NATIVE) - return SIMDE__M64_C(_mm_unpackhi_pi8(a.n, b.n)); +#if defined(SIMDE_X86_MMX_NATIVE) + return _mm_unpackhi_pi8(a, b); #else - simde__m64 r; - r.i8[0] = a.i8[4]; - r.i8[1] = b.i8[4]; - r.i8[2] = a.i8[5]; - r.i8[3] = b.i8[5]; - r.i8[4] = a.i8[6]; - r.i8[5] = b.i8[6]; - r.i8[6] = a.i8[7]; - r.i8[7] = b.i8[7]; - return r; + simde__m64_private r_; + simde__m64_private a_ = simde__m64_to_private(a); + simde__m64_private b_ = simde__m64_to_private(b); + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + r_.neon_i8 = vzip2_s8(a_.neon_i8, b_.neon_i8); +#elif defined(SIMDE_SHUFFLE_VECTOR_) + r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 8, a_.i8, b_.i8, 4, 12, 5, 13, 6, 14, + 7, 15); +#else + r_.i8[0] = a_.i8[4]; + r_.i8[1] = b_.i8[4]; + r_.i8[2] = a_.i8[5]; + r_.i8[3] = b_.i8[5]; + r_.i8[4] = a_.i8[6]; + r_.i8[5] = b_.i8[6]; + r_.i8[6] = a_.i8[7]; + r_.i8[7] = b_.i8[7]; +#endif + + return simde__m64_from_private(r_); #endif } #define simde_m_punpckhbw(a, b) simde_mm_unpackhi_pi8(a, b) +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _mm_unpackhi_pi8(a, b) simde_mm_unpackhi_pi8(a, b) +#define _m_punpckhbw(a, b) simde_mm_unpackhi_pi8(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_unpackhi_pi16(simde__m64 a, simde__m64 b) { -#if 
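Editorial aside on the simde_mm_subs_* fallbacks above: they all follow the same check-before-subtract pattern so the saturating result is computed without signed overflow. A self-contained scalar sketch of that logic for one int16 lane (illustrative only, not part of the patch):

#include <stdint.h>

/* Saturating signed 16-bit subtraction: clamp instead of wrapping. */
static int16_t subs_i16(int16_t a, int16_t b)
{
	if (b > 0 && a < INT16_MIN + b)
		return INT16_MIN;
	if (b < 0 && a > INT16_MAX + b)
		return INT16_MAX;
	return (int16_t)(a - b);
}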
defined(SIMDE_MMX_NATIVE) - return SIMDE__M64_C(_mm_unpackhi_pi16(a.n, b.n)); +#if defined(SIMDE_X86_MMX_NATIVE) + return _mm_unpackhi_pi16(a, b); #else - simde__m64 r; - r.i16[0] = a.i16[2]; - r.i16[1] = b.i16[2]; - r.i16[2] = a.i16[3]; - r.i16[3] = b.i16[3]; - return r; + simde__m64_private r_; + simde__m64_private a_ = simde__m64_to_private(a); + simde__m64_private b_ = simde__m64_to_private(b); + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + r_.neon_i16 = vzip2_s16(a_.neon_i16, b_.neon_i16); +#elif defined(SIMDE_SHUFFLE_VECTOR_) + r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 8, a_.i16, b_.i16, 2, 6, 3, 7); +#else + r_.i16[0] = a_.i16[2]; + r_.i16[1] = b_.i16[2]; + r_.i16[2] = a_.i16[3]; + r_.i16[3] = b_.i16[3]; +#endif + + return simde__m64_from_private(r_); #endif } #define simde_m_punpckhwd(a, b) simde_mm_unpackhi_pi16(a, b) +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _mm_unpackhi_pi16(a, b) simde_mm_unpackhi_pi16(a, b) +#define _m_punpckhwd(a, b) simde_mm_unpackhi_pi16(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_unpackhi_pi32(simde__m64 a, simde__m64 b) { -#if defined(SIMDE_MMX_NATIVE) - return SIMDE__M64_C(_mm_unpackhi_pi32(a.n, b.n)); +#if defined(SIMDE_X86_MMX_NATIVE) + return _mm_unpackhi_pi32(a, b); #else - simde__m64 r; - r.i32[0] = a.i32[1]; - r.i32[1] = b.i32[1]; - return r; + simde__m64_private r_; + simde__m64_private a_ = simde__m64_to_private(a); + simde__m64_private b_ = simde__m64_to_private(b); + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + r_.neon_i32 = vzip2_s32(a_.neon_i32, b_.neon_i32); +#elif defined(SIMDE_SHUFFLE_VECTOR_) + r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.i32, b_.i32, 1, 3); +#else + r_.i32[0] = a_.i32[1]; + r_.i32[1] = b_.i32[1]; +#endif + + return simde__m64_from_private(r_); #endif } #define simde_m_punpckhdq(a, b) simde_mm_unpackhi_pi32(a, b) +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _mm_unpackhi_pi32(a, b) simde_mm_unpackhi_pi32(a, b) +#define _m_punpckhdq(a, b) simde_mm_unpackhi_pi32(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_unpacklo_pi8(simde__m64 a, simde__m64 b) { -#if defined(SIMDE_MMX_NATIVE) - return SIMDE__M64_C(_mm_unpacklo_pi8(a.n, b.n)); +#if defined(SIMDE_X86_MMX_NATIVE) + return _mm_unpacklo_pi8(a, b); #else - simde__m64 r; - r.i8[0] = a.i8[0]; - r.i8[1] = b.i8[0]; - r.i8[2] = a.i8[1]; - r.i8[3] = b.i8[1]; - r.i8[4] = a.i8[2]; - r.i8[5] = b.i8[2]; - r.i8[6] = a.i8[3]; - r.i8[7] = b.i8[3]; - return r; + simde__m64_private r_; + simde__m64_private a_ = simde__m64_to_private(a); + simde__m64_private b_ = simde__m64_to_private(b); + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + r_.neon_i8 = vzip1_s8(a_.neon_i8, b_.neon_i8); +#elif defined(SIMDE_SHUFFLE_VECTOR_) + r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 8, a_.i8, b_.i8, 0, 8, 1, 9, 2, 10, 3, + 11); +#else + r_.i8[0] = a_.i8[0]; + r_.i8[1] = b_.i8[0]; + r_.i8[2] = a_.i8[1]; + r_.i8[3] = b_.i8[1]; + r_.i8[4] = a_.i8[2]; + r_.i8[5] = b_.i8[2]; + r_.i8[6] = a_.i8[3]; + r_.i8[7] = b_.i8[3]; +#endif + + return simde__m64_from_private(r_); #endif } #define simde_m_punpcklbw(a, b) simde_mm_unpacklo_pi8(a, b) +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _mm_unpacklo_pi8(a, b) simde_mm_unpacklo_pi8(a, b) +#define _m_punpcklbw(a, b) simde_mm_unpacklo_pi8(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_unpacklo_pi16(simde__m64 a, simde__m64 b) { -#if defined(SIMDE_MMX_NATIVE) - return SIMDE__M64_C(_mm_unpacklo_pi16(a.n, b.n)); +#if 
defined(SIMDE_X86_MMX_NATIVE) + return _mm_unpacklo_pi16(a, b); #else - simde__m64 r; - r.i16[0] = a.i16[0]; - r.i16[1] = b.i16[0]; - r.i16[2] = a.i16[1]; - r.i16[3] = b.i16[1]; - return r; + simde__m64_private r_; + simde__m64_private a_ = simde__m64_to_private(a); + simde__m64_private b_ = simde__m64_to_private(b); + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + r_.neon_i16 = vzip1_s16(a_.neon_i16, b_.neon_i16); +#elif defined(SIMDE_SHUFFLE_VECTOR_) + r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 8, a_.i16, b_.i16, 0, 4, 1, 5); +#else + r_.i16[0] = a_.i16[0]; + r_.i16[1] = b_.i16[0]; + r_.i16[2] = a_.i16[1]; + r_.i16[3] = b_.i16[1]; +#endif + + return simde__m64_from_private(r_); #endif } #define simde_m_punpcklwd(a, b) simde_mm_unpacklo_pi16(a, b) +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _mm_unpacklo_pi16(a, b) simde_mm_unpacklo_pi16(a, b) +#define _m_punpcklwd(a, b) simde_mm_unpacklo_pi16(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_unpacklo_pi32(simde__m64 a, simde__m64 b) { -#if defined(SIMDE_MMX_NATIVE) - return SIMDE__M64_C(_mm_unpacklo_pi32(a.n, b.n)); +#if defined(SIMDE_X86_MMX_NATIVE) + return _mm_unpacklo_pi32(a, b); #else - simde__m64 r; - r.i32[0] = a.i32[0]; - r.i32[1] = b.i32[0]; - return r; + simde__m64_private r_; + simde__m64_private a_ = simde__m64_to_private(a); + simde__m64_private b_ = simde__m64_to_private(b); + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + r_.neon_i32 = vzip1_s32(a_.neon_i32, b_.neon_i32); +#elif defined(SIMDE_SHUFFLE_VECTOR_) + r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.i32, b_.i32, 0, 2); +#else + r_.i32[0] = a_.i32[0]; + r_.i32[1] = b_.i32[0]; +#endif + + return simde__m64_from_private(r_); #endif } #define simde_m_punpckldq(a, b) simde_mm_unpacklo_pi32(a, b) +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _mm_unpacklo_pi32(a, b) simde_mm_unpacklo_pi32(a, b) +#define _m_punpckldq(a, b) simde_mm_unpacklo_pi32(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_xor_si64(simde__m64 a, simde__m64 b) { -#if defined(SIMDE_MMX_NATIVE) - return SIMDE__M64_C(_mm_xor_si64(a.n, b.n)); +#if defined(SIMDE_X86_MMX_NATIVE) + return _mm_xor_si64(a, b); #else - simde__m64 r; - r.i64[0] = a.i64[0] ^ b.i64[0]; - return r; + simde__m64_private r_; + simde__m64_private a_ = simde__m64_to_private(a); + simde__m64_private b_ = simde__m64_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i32 = veor_s32(a_.neon_i32, b_.neon_i32); +#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i32f = a_.i32f ^ b_.i32f; +#else + r_.u64[0] = a_.u64[0] ^ b_.u64[0]; +#endif + + return simde__m64_from_private(r_); #endif } #define simde_m_pxor(a, b) simde_mm_xor_si64(a, b) +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _mm_xor_si64(a, b) simde_mm_xor_si64(a, b) +#define _m_pxor(a, b) simde_mm_xor_si64(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES int32_t simde_m_to_int(simde__m64 a) { -#if defined(SIMDE_MMX_NATIVE) - return _m_to_int(a.n); +#if defined(SIMDE_X86_MMX_NATIVE) + return _m_to_int(a); +#else + simde__m64_private a_ = simde__m64_to_private(a); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vget_lane_s32(a_.neon_i32, 0); #else - return a.i32[0]; + return a_.i32[0]; +#endif #endif } +#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) +#define _m_to_int(a) simde_m_to_int(a) +#endif + +SIMDE_END_DECLS_ -SIMDE__END_DECLS +HEDLEY_DIAGNOSTIC_POP -#endif /* !defined(SIMDE__MMX_H) */ +#endif /* !defined(SIMDE_X86_MMX_H) */ diff --git 
a/libobs/util/simde/simde-arch.h b/libobs/util/simde/simde-arch.h index 532304cda961761c19dfa7b5d5907ce0c3827856..69e302491ad119a2d4a465b2d6fecbd51fd3bd10 100644 --- a/libobs/util/simde/simde-arch.h +++ b/libobs/util/simde/simde-arch.h @@ -6,6 +6,8 @@ * details, see the Creative Commons Zero 1.0 Universal license at * * + * SPDX-License-Identifier: CC0-1.0 + * * Different compilers define different preprocessor macros for the * same architecture. This is an attempt to provide a single * interface which is usable on any compiler. @@ -53,6 +55,11 @@ #define SIMDE_ARCH_ALPHA 1 #endif #endif +#if defined(SIMDE_ARCH_ALPHA) +#define SIMDE_ARCH_ALPHA_CHECK(version) ((version) <= SIMDE_ARCH_ALPHA) +#else +#define SIMDE_ARCH_ALPHA_CHECK(version) (0) +#endif /* Atmel AVR */ @@ -64,7 +71,7 @@ */ #if defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || \ defined(__x86_64) || defined(_M_X66) || defined(_M_AMD64) -#define SIMDE_ARCH_AMD64 1 +#define SIMDE_ARCH_AMD64 1000 #endif /* ARM @@ -93,11 +100,30 @@ defined(_ARM) || defined(_M_ARM) || defined(_M_ARM) #define SIMDE_ARCH_ARM 1 #endif +#if defined(SIMDE_ARCH_ARM) +#define SIMDE_ARCH_ARM_CHECK(version) ((version) <= SIMDE_ARCH_ARM) +#else +#define SIMDE_ARCH_ARM_CHECK(version) (0) +#endif /* AArch64 */ #if defined(__aarch64__) || defined(_M_ARM64) -#define SIMDE_ARCH_AARCH64 10 +#define SIMDE_ARCH_AARCH64 1000 +#endif +#if defined(SIMDE_ARCH_AARCH64) +#define SIMDE_ARCH_AARCH64_CHECK(version) ((version) <= SIMDE_ARCH_AARCH64) +#else +#define SIMDE_ARCH_AARCH64_CHECK(version) (0) +#endif + +/* ARM SIMD ISA extensions */ +#if defined(__ARM_NEON) +#if defined(SIMDE_ARCH_AARCH64) +#define SIMDE_ARCH_ARM_NEON SIMDE_ARCH_AARCH64 +#elif defined(SIMDE_ARCH_ARM) +#define SIMDE_ARCH_ARM_NEON SIMDE_ARCH_ARM +#endif #endif /* Blackfin @@ -128,6 +154,11 @@ #elif defined(__convex__) #define SIMDE_ARCH_CONVEX 1 #endif +#if defined(SIMDE_ARCH_CONVEX) +#define SIMDE_ARCH_CONVEX_CHECK(version) ((version) <= SIMDE_ARCH_CONVEX) +#else +#define SIMDE_ARCH_CONVEX_CHECK(version) (0) +#endif /* Adapteva Epiphany */ @@ -159,6 +190,11 @@ #elif defined(__hppa__) || defined(__HPPA__) || defined(__hppa) #define SIMDE_ARCH_HPPA 1 #endif +#if defined(SIMDE_ARCH_HPPA) +#define SIMDE_ARCH_HPPA_CHECK(version) ((version) <= SIMDE_ARCH_HPPA) +#else +#define SIMDE_ARCH_HPPA_CHECK(version) (0) +#endif /* x86 */ @@ -177,6 +213,88 @@ #elif defined(_X86_) || defined(__X86__) || defined(__THW_INTEL__) #define SIMDE_ARCH_X86 3 #endif +#if defined(SIMDE_ARCH_X86) +#define SIMDE_ARCH_X86_CHECK(version) ((version) <= SIMDE_ARCH_X86) +#else +#define SIMDE_ARCH_X86_CHECK(version) (0) +#endif + +/* SIMD ISA extensions for x86/x86_64 */ +#if defined(SIMDE_ARCH_X86) || defined(SIMDE_ARCH_AMD64) +#if defined(_M_IX86_FP) +#define SIMDE_ARCH_X86_MMX +#if (_M_IX86_FP >= 1) +#define SIMDE_ARCH_X86_SSE 1 +#endif +#if (_M_IX86_FP >= 2) +#define SIMDE_ARCH_X86_SSE2 1 +#endif +#elif defined(_M_X64) +#define SIMDE_ARCH_X86_SSE 1 +#define SIMDE_ARCH_X86_SSE2 1 +#else +#if defined(__MMX__) +#define SIMDE_ARCH_X86_MMX 1 +#endif +#if defined(__SSE__) +#define SIMDE_ARCH_X86_SSE 1 +#endif +#if defined(__SSE2__) +#define SIMDE_ARCH_X86_SSE2 1 +#endif +#endif +#if defined(__SSE3__) +#define SIMDE_ARCH_X86_SSE3 1 +#endif +#if defined(__SSSE3__) +#define SIMDE_ARCH_X86_SSSE3 1 +#endif +#if defined(__SSE4_1__) +#define SIMDE_ARCH_X86_SSE4_1 1 +#endif +#if defined(__SSE4_2__) +#define SIMDE_ARCH_X86_SSE4_2 1 +#endif +#if defined(__AVX__) +#define SIMDE_ARCH_X86_AVX 1 +#if !defined(SIMDE_ARCH_X86_SSE3) +#define 
SIMDE_ARCH_X86_SSE3 1 +#endif +#if !defined(SIMDE_ARCH_X86_SSE4_1) +#define SIMDE_ARCH_X86_SSE4_1 1 +#endif +#if !defined(SIMDE_ARCH_X86_SSE4_1) +#define SIMDE_ARCH_X86_SSE4_2 1 +#endif +#endif +#if defined(__AVX2__) +#define SIMDE_ARCH_X86_AVX2 1 +#endif +#if defined(__FMA__) +#define SIMDE_ARCH_X86_FMA 1 +#if !defined(SIMDE_ARCH_X86_AVX) +#define SIMDE_ARCH_X86_AVX 1 +#endif +#endif +#if defined(__AVX512BW__) +#define SIMDE_ARCH_X86_AVX512BW 1 +#endif +#if defined(__AVX512CD__) +#define SIMDE_ARCH_X86_AVX512CD 1 +#endif +#if defined(__AVX512DQ__) +#define SIMDE_ARCH_X86_AVX512DQ 1 +#endif +#if defined(__AVX512F__) +#define SIMDE_ARCH_X86_AVX512F 1 +#endif +#if defined(__AVX512VL__) +#define SIMDE_ARCH_X86_AVX512VL 1 +#endif +#if defined(__GFNI__) +#define SIMDE_ARCH_X86_GFNI 1 +#endif +#endif /* Itanium */ @@ -206,6 +324,11 @@ #elif defined(__mc68000__) || defined(__MC68000__) #define SIMDE_ARCH_M68K 68000 #endif +#if defined(SIMDE_ARCH_M68K) +#define SIMDE_ARCH_M68K_CHECK(version) ((version) <= SIMDE_ARCH_M68K) +#else +#define SIMDE_ARCH_M68K_CHECK(version) (0) +#endif /* Xilinx MicroBlaze */ @@ -234,6 +357,11 @@ #elif defined(_MIPS_ISA_MIPS) || defined(__mips) || defined(__MIPS__) #define SIMDE_ARCH_MIPS 1 #endif +#if defined(SIMDE_ARCH_MIPS) +#define SIMDE_ARCH_MIPS_CHECK(version) ((version) <= SIMDE_ARCH_MIPS) +#else +#define SIMDE_ARCH_MIPS_CHECK(version) (0) +#endif /* Matsushita MN10300 */ @@ -245,6 +373,8 @@ */ #if defined(_M_PPC) #define SIMDE_ARCH_POWER _M_PPC +#elif defined(_ARCH_PWR9) +#define SIMDE_ARCH_POWER 900 #elif defined(_ARCH_PWR8) #define SIMDE_ARCH_POWER 800 #elif defined(_ARCH_PWR7) @@ -274,6 +404,20 @@ defined(__ppc) #define SIMDE_ARCH_POWER 1 #endif +#if defined(SIMDE_ARCH_POWER) +#define SIMDE_ARCH_POWER_CHECK(version) ((version) <= SIMDE_ARCH_POWER) +#else +#define SIMDE_ARCH_POWER_CHECK(version) (0) +#endif + +#if defined(__ALTIVEC__) +#define SIMDE_ARCH_POWER_ALTIVEC SIMDE_ARCH_POWER +#endif +#if defined(SIMDE_ARCH_POWER) +#define SIMDE_ARCH_POWER_ALTIVEC_CHECK(version) ((version) <= SIMDE_ARCH_POWER) +#else +#define SIMDE_ARCH_POWER_ALTIVEC_CHECK(version) (0) +#endif /* SPARC */ @@ -298,6 +442,11 @@ #elif defined(__sparc__) || defined(__sparc) #define SIMDE_ARCH_SPARC 1 #endif +#if defined(SIMDE_ARCH_SPARC) +#define SIMDE_ARCH_SPARC_CHECK(version) ((version) <= SIMDE_ARCH_SPARC) +#else +#define SIMDE_ARCH_SPARC_CHECK(version) (0) +#endif /* SuperH */ @@ -345,6 +494,20 @@ #elif defined(_TMS320C28X) || defined(__TMS320C28X__) #define SIMDE_ARCH_TMS320 280 #endif +#if defined(SIMDE_ARCH_TMS320) +#define SIMDE_ARCH_TMS320_CHECK(version) ((version) <= SIMDE_ARCH_TMS320) +#else +#define SIMDE_ARCH_TMS320_CHECK(version) (0) +#endif + +/* WebAssembly */ +#if defined(__wasm__) +#define SIMDE_ARCH_WASM 1 +#endif + +#if defined(SIMDE_ARCH_WASM) && defined(__wasm_simd128__) +#define SIMDE_ARCH_WASM_SIMD128 +#endif /* Xtensa */ diff --git a/libobs/util/simde/simde-common.h b/libobs/util/simde/simde-common.h index 7279d54ac6e21fd2c1aa68e67cfbceb2d84c4205..3e799cde1dae91d1e99e7ba99611bffb193007fc 100644 --- a/libobs/util/simde/simde-common.h +++ b/libobs/util/simde/simde-common.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2017-2019 Evan Nemerson +/* SPDX-License-Identifier: MIT * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -19,39 +19,254 @@ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
+ * + * Copyright: + * 2017-2020 Evan Nemerson */ #if !defined(SIMDE_COMMON_H) #define SIMDE_COMMON_H #include "hedley.h" -#include "check.h" + +#define SIMDE_VERSION_MAJOR 0 +#define SIMDE_VERSION_MINOR 5 +#define SIMDE_VERSION_MICRO 0 +#define SIMDE_VERSION \ + HEDLEY_VERSION_ENCODE(SIMDE_VERSION_MAJOR, SIMDE_VERSION_MINOR, \ + SIMDE_VERSION_MICRO) + #include "simde-arch.h" +#include "simde-features.h" +#include "simde-diagnostic.h" -#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) -#define SIMDE_ALIGN(alignment) _Alignas(alignment) -#elif (defined(__cplusplus) && (__cplusplus >= 201103L)) -#define SIMDE_ALIGN(alignment) alignas(alignment) -#elif HEDLEY_GCC_VERSION_CHECK(2, 95, 0) || \ - HEDLEY_CRAY_VERSION_CHECK(8, 4, 0) || \ - HEDLEY_IBM_VERSION_CHECK(11, 1, 0) || \ - HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) || \ - HEDLEY_PGI_VERSION_CHECK(19, 4, 0) || \ - HEDLEY_ARM_VERSION_CHECK(4, 1, 0) || \ - HEDLEY_TINYC_VERSION_CHECK(0, 9, 24) || \ +#include +#include + +#if HEDLEY_HAS_ATTRIBUTE(aligned) || HEDLEY_GCC_VERSION_CHECK(2, 95, 0) || \ + HEDLEY_CRAY_VERSION_CHECK(8, 4, 0) || \ + HEDLEY_IBM_VERSION_CHECK(11, 1, 0) || \ + HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) || \ + HEDLEY_PGI_VERSION_CHECK(19, 4, 0) || \ + HEDLEY_ARM_VERSION_CHECK(4, 1, 0) || \ + HEDLEY_TINYC_VERSION_CHECK(0, 9, 24) || \ HEDLEY_TI_VERSION_CHECK(8, 1, 0) #define SIMDE_ALIGN(alignment) __attribute__((aligned(alignment))) -#elif defined(_MSC_VER) && (!defined(_M_IX86) || defined(_M_AMD64)) +#elif defined(_MSC_VER) && !(defined(_M_ARM) && !defined(_M_ARM64)) #define SIMDE_ALIGN(alignment) __declspec(align(alignment)) +#elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) +#define SIMDE_ALIGN(alignment) _Alignas(alignment) +#elif defined(__cplusplus) && (__cplusplus >= 201103L) +#define SIMDE_ALIGN(alignment) alignas(alignment) #else #define SIMDE_ALIGN(alignment) #endif -#define simde_assert_aligned(alignment, val) \ - simde_assert_int(((uintptr_t)(val)) % (alignment), ==, 0) +#if HEDLEY_GNUC_VERSION_CHECK(2, 95, 0) || \ + HEDLEY_ARM_VERSION_CHECK(4, 1, 0) || \ + HEDLEY_IBM_VERSION_CHECK(11, 1, 0) +#define SIMDE_ALIGN_OF(T) (__alignof__(T)) +#elif (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) || \ + HEDLEY_HAS_FEATURE(c11_alignof) +#define SIMDE_ALIGN_OF(T) (_Alignof(T)) +#elif (defined(__cplusplus) && (__cplusplus >= 201103L)) || \ + HEDLEY_HAS_FEATURE(cxx_alignof) +#define SIMDE_ALIGN_OF(T) (alignof(T)) +#endif + +#if defined(SIMDE_ALIGN_OF) +#define SIMDE_ALIGN_AS(N, T) SIMDE_ALIGN(SIMDE_ALIGN_OF(T)) +#else +#define SIMDE_ALIGN_AS(N, T) SIMDE_ALIGN(N) +#endif -#if HEDLEY_GCC_HAS_ATTRIBUTE(vector_size, 4, 6, 0) -#define SIMDE__ENABLE_GCC_VEC_EXT +#define simde_assert_aligned(alignment, val) \ + simde_assert_int(HEDLEY_REINTERPRET_CAST( \ + uintptr_t, HEDLEY_REINTERPRET_CAST( \ + const void *, (val))) % \ + (alignment), \ + ==, 0) + +#if HEDLEY_HAS_BUILTIN(__builtin_constant_p) || \ + HEDLEY_GCC_VERSION_CHECK(3, 4, 0) || \ + HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) || \ + HEDLEY_TINYC_VERSION_CHECK(0, 9, 19) || \ + HEDLEY_ARM_VERSION_CHECK(4, 1, 0) || \ + HEDLEY_IBM_VERSION_CHECK(13, 1, 0) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(6, 1, 0) || \ + (HEDLEY_SUNPRO_VERSION_CHECK(5, 10, 0) && !defined(__cplusplus)) || \ + HEDLEY_CRAY_VERSION_CHECK(8, 1, 0) +#define SIMDE_CHECK_CONSTANT_(expr) (__builtin_constant_p(expr)) +#elif defined(__cplusplus) && (__cplusplus > 201703L) +#include +#define SIMDE_CHECK_CONSTANT_(expr) (std::is_constant_evaluated()) +#endif + +/* diagnose_if + __builtin_constant_p was 
broken until clang 9, + * which is when __FILE_NAME__ was added. */ +#if defined(SIMDE_CHECK_CONSTANT_) && defined(__FILE_NAME__) +#define SIMDE_REQUIRE_CONSTANT(arg) \ + HEDLEY_REQUIRE_MSG(SIMDE_CHECK_CONSTANT_(arg), \ + "`" #arg "' must be constant") +#else +#define SIMDE_REQUIRE_CONSTANT(arg) +#endif + +#define SIMDE_REQUIRE_RANGE(arg, min, max) \ + HEDLEY_REQUIRE_MSG((((arg) >= (min)) && ((arg) <= (max))), \ + "'" #arg "' must be in [" #min ", " #max "]") + +#define SIMDE_REQUIRE_CONSTANT_RANGE(arg, min, max) \ + SIMDE_REQUIRE_CONSTANT(arg) \ + SIMDE_REQUIRE_RANGE(arg, min, max) + +/* SIMDE_ASSUME_ALIGNED allows you to (try to) tell the compiler + * that a pointer is aligned to an `alignment`-byte boundary. */ +#if HEDLEY_HAS_BUILTIN(__builtin_assume_aligned) || \ + HEDLEY_GCC_VERSION_CHECK(4, 7, 0) +#define SIMDE_ASSUME_ALIGNED(alignment, v) \ + HEDLEY_REINTERPRET_CAST(__typeof__(v), \ + __builtin_assume_aligned(v, alignment)) +#elif defined(__cplusplus) && (__cplusplus > 201703L) +#define SIMDE_ASSUME_ALIGNED(alignment, v) std::assume_aligned(v) +#elif HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) +#define SIMDE_ASSUME_ALIGNED(alignment, v) \ + (__extension__({ \ + __typeof__(v) simde_assume_aligned_t_ = (v); \ + __assume_aligned(simde_assume_aligned_t_, alignment); \ + simde_assume_aligned_t_; \ + })) +#else +#define SIMDE_ASSUME_ALIGNED(alignment, v) (v) +#endif + +/* SIMDE_ALIGN_CAST allows you to convert to a type with greater + * aligment requirements without triggering a warning. */ +#if HEDLEY_HAS_WARNING("-Wcast-align") +#define SIMDE_ALIGN_CAST(T, v) \ + (__extension__({ \ + HEDLEY_DIAGNOSTIC_PUSH \ + _Pragma("clang diagnostic ignored \"-Wcast-align\"") \ + T simde_r_ = HEDLEY_REINTERPRET_CAST(T, v); \ + HEDLEY_DIAGNOSTIC_POP \ + simde_r_; \ + })) +#else +#define SIMDE_ALIGN_CAST(T, v) HEDLEY_REINTERPRET_CAST(T, v) +#endif + +#if (HEDLEY_HAS_ATTRIBUTE(may_alias) && !defined(HEDLEY_SUNPRO_VERSION)) || \ + HEDLEY_GCC_VERSION_CHECK(3, 3, 0) || \ + HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) || \ + HEDLEY_IBM_VERSION_CHECK(13, 1, 0) +#define SIMDE_MAY_ALIAS __attribute__((__may_alias__)) +#else +#define SIMDE_MAY_ALIAS +#endif + +/* Lots of compilers support GCC-style vector extensions, but many + don't support all the features. Define different macros depending + on support for + + * SIMDE_VECTOR - Declaring a vector. + * SIMDE_VECTOR_OPS - basic operations (binary and unary). + * SIMDE_VECTOR_SCALAR - For binary operators, the second argument + can be a scalar, in which case the result is as if that scalar + had been broadcast to all lanes of a vector. + * SIMDE_VECTOR_SUBSCRIPT - Supports array subscript notation for + extracting/inserting a single element.= + + SIMDE_VECTOR can be assumed if any others are defined, the + others are independent. 
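The capability split described in the comment above is easiest to see with a small example. A sketch using the GCC/Clang vector_size extension, assuming a compiler that supports all of OPS, SCALAR, and SUBSCRIPT; the type and function names are illustrative, not SIMDe API:

#include <stdint.h>

typedef int32_t i32x4 __attribute__((__vector_size__(16)));

static int32_t demo(i32x4 a, i32x4 b)
{
	i32x4 sum = a + b; /* VECTOR_OPS: element-wise binary operator */
	sum = sum * 2;     /* VECTOR_SCALAR: scalar broadcast to all lanes */
	return sum[0];     /* VECTOR_SUBSCRIPT: per-lane access */
}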
*/ +#if !defined(SIMDE_NO_VECTOR) +#if HEDLEY_GCC_VERSION_CHECK(4, 8, 0) +#define SIMDE_VECTOR(size) __attribute__((__vector_size__(size))) +#define SIMDE_VECTOR_OPS +#define SIMDE_VECTOR_SCALAR +#define SIMDE_VECTOR_SUBSCRIPT +#elif HEDLEY_INTEL_VERSION_CHECK(16, 0, 0) +#define SIMDE_VECTOR(size) __attribute__((__vector_size__(size))) +#define SIMDE_VECTOR_OPS +/* ICC only supports SIMDE_VECTOR_SCALAR for constants */ +#define SIMDE_VECTOR_SUBSCRIPT +#elif HEDLEY_GCC_VERSION_CHECK(4, 1, 0) || HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) +#define SIMDE_VECTOR(size) __attribute__((__vector_size__(size))) +#define SIMDE_VECTOR_OPS +#elif HEDLEY_SUNPRO_VERSION_CHECK(5, 12, 0) +#define SIMDE_VECTOR(size) __attribute__((__vector_size__(size))) +#elif HEDLEY_HAS_ATTRIBUTE(vector_size) +#define SIMDE_VECTOR(size) __attribute__((__vector_size__(size))) +#define SIMDE_VECTOR_OPS +#define SIMDE_VECTOR_SUBSCRIPT +#if HEDLEY_HAS_ATTRIBUTE(diagnose_if) /* clang 4.0 */ +#define SIMDE_VECTOR_SCALAR +#endif +#endif + +/* GCC and clang have built-in functions to handle shuffling and + converting of vectors, but the implementations are slightly + different. This macro is just an abstraction over them. Note that + elem_size is in bits but vec_size is in bytes. */ +#if !defined(SIMDE_NO_SHUFFLE_VECTOR) && defined(SIMDE_VECTOR_SUBSCRIPT) +HEDLEY_DIAGNOSTIC_PUSH +/* We don't care about -Wvariadic-macros; all compilers that support + * shufflevector/shuffle support them. */ +#if HEDLEY_HAS_WARNING("-Wc++98-compat-pedantic") +#pragma clang diagnostic ignored "-Wc++98-compat-pedantic" +#endif +#if HEDLEY_HAS_WARNING("-Wvariadic-macros") || HEDLEY_GCC_VERSION_CHECK(4, 0, 0) +#pragma GCC diagnostic ignored "-Wvariadic-macros" +#endif + +#if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) +#define SIMDE_SHUFFLE_VECTOR_(elem_size, vec_size, a, b, ...) \ + __builtin_shufflevector(a, b, __VA_ARGS__) +#elif HEDLEY_GCC_HAS_BUILTIN(__builtin_shuffle, 4, 7, 0) && \ + !defined(__INTEL_COMPILER) +#define SIMDE_SHUFFLE_VECTOR_(elem_size, vec_size, a, b, ...) \ + (__extension__({ \ + int##elem_size##_t SIMDE_VECTOR(vec_size) \ + simde_shuffle_ = {__VA_ARGS__}; \ + __builtin_shuffle(a, b, simde_shuffle_); \ + })) +#endif +HEDLEY_DIAGNOSTIC_POP +#endif + +/* TODO: this actually works on XL C/C++ without SIMDE_VECTOR_SUBSCRIPT + but the code needs to be refactored a bit to take advantage. */ +#if !defined(SIMDE_NO_CONVERT_VECTOR) && defined(SIMDE_VECTOR_SUBSCRIPT) +#if HEDLEY_HAS_BUILTIN(__builtin_convertvector) || \ + HEDLEY_GCC_VERSION_CHECK(9, 0, 0) +#if HEDLEY_GCC_VERSION_CHECK(9, 0, 0) && !HEDLEY_GCC_VERSION_CHECK(9, 3, 0) +/* https://gcc.gnu.org/bugzilla/show_bug.cgi?id=93557 */ +#define SIMDE_CONVERT_VECTOR_(to, from) \ + ((to) = (__extension__({ \ + __typeof__(from) from_ = (from); \ + ((void)from_); \ + __builtin_convertvector(from_, __typeof__(to)); \ + }))) +#else +#define SIMDE_CONVERT_VECTOR_(to, from) \ + ((to) = __builtin_convertvector((from), __typeof__(to))) +#endif +#endif +#endif +#endif + +/* Since we currently require SUBSCRIPT before using a vector in a + union, we define these as dependencies of SUBSCRIPT. They are + likely to disappear in the future, once SIMDe learns how to make + use of vectors without using the union members. Do not use them + in your code unless you're okay with it breaking when SIMDe + changes. 
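For readers unfamiliar with the two builtins being abstracted here, a short sketch of what they do with plain compiler builtins rather than the SIMDe macros, assuming Clang (or GCC 9+ for __builtin_convertvector):

#include <stdint.h>

typedef int8_t i8x8 __attribute__((__vector_size__(8)));
typedef int32_t i32x4 __attribute__((__vector_size__(16)));
typedef float f32x4 __attribute__((__vector_size__(16)));

/* Interleave the low halves of a and b (the punpcklbw pattern). */
static i8x8 interleave_lo(i8x8 a, i8x8 b)
{
	return __builtin_shufflevector(a, b, 0, 8, 1, 9, 2, 10, 3, 11);
}

/* Lane-wise int32 -> float conversion. */
static f32x4 to_float(i32x4 v)
{
	return __builtin_convertvector(v, f32x4);
}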
*/ +#if defined(SIMDE_VECTOR_SUBSCRIPT) +#if defined(SIMDE_VECTOR_OPS) +#define SIMDE_VECTOR_SUBSCRIPT_OPS +#endif +#if defined(SIMDE_VECTOR_SCALAR) +#define SIMDE_VECTOR_SUBSCRIPT_SCALAR +#endif #endif #if !defined(SIMDE_ENABLE_OPENMP) && \ @@ -60,81 +275,197 @@ #define SIMDE_ENABLE_OPENMP #endif -#if !defined(SIMDE_ENABLE_CILKPLUS) && defined(__cilk) +#if !defined(SIMDE_ENABLE_CILKPLUS) && \ + (defined(__cilk) || defined(HEDLEY_INTEL_VERSION)) #define SIMDE_ENABLE_CILKPLUS #endif #if defined(SIMDE_ENABLE_OPENMP) -#define SIMDE__VECTORIZE _Pragma("omp simd") -#define SIMDE__VECTORIZE_SAFELEN(l) HEDLEY_PRAGMA(omp simd safelen(l)) -#define SIMDE__VECTORIZE_REDUCTION(r) HEDLEY_PRAGMA(omp simd reduction(r)) -#define SIMDE__VECTORIZE_ALIGNED(a) HEDLEY_PRAGMA(omp simd aligned(a)) +#define SIMDE_VECTORIZE _Pragma("omp simd") +#define SIMDE_VECTORIZE_SAFELEN(l) HEDLEY_PRAGMA(omp simd safelen(l)) +#define SIMDE_VECTORIZE_REDUCTION(r) HEDLEY_PRAGMA(omp simd reduction(r)) +#define SIMDE_VECTORIZE_ALIGNED(a) HEDLEY_PRAGMA(omp simd aligned(a)) #elif defined(SIMDE_ENABLE_CILKPLUS) -#define SIMDE__VECTORIZE _Pragma("simd") -#define SIMDE__VECTORIZE_SAFELEN(l) HEDLEY_PRAGMA(simd vectorlength(l)) -#define SIMDE__VECTORIZE_REDUCTION(r) HEDLEY_PRAGMA(simd reduction(r)) -#define SIMDE__VECTORIZE_ALIGNED(a) HEDLEY_PRAGMA(simd aligned(a)) -#elif defined(__INTEL_COMPILER) -#define SIMDE__VECTORIZE _Pragma("simd") -#define SIMDE__VECTORIZE_SAFELEN(l) HEDLEY_PRAGMA(simd vectorlength(l)) -#define SIMDE__VECTORIZE_REDUCTION(r) HEDLEY_PRAGMA(simd reduction(r)) -#define SIMDE__VECTORIZE_ALIGNED(a) -#elif defined(__clang__) -#define SIMDE__VECTORIZE _Pragma("clang loop vectorize(enable)") -#define SIMDE__VECTORIZE_SAFELEN(l) HEDLEY_PRAGMA(clang loop vectorize_width(l)) -#define SIMDE__VECTORIZE_REDUCTION(r) SIMDE__VECTORIZE -#define SIMDE__VECTORIZE_ALIGNED(a) +#define SIMDE_VECTORIZE _Pragma("simd") +#define SIMDE_VECTORIZE_SAFELEN(l) HEDLEY_PRAGMA(simd vectorlength(l)) +#define SIMDE_VECTORIZE_REDUCTION(r) HEDLEY_PRAGMA(simd reduction(r)) +#define SIMDE_VECTORIZE_ALIGNED(a) HEDLEY_PRAGMA(simd aligned(a)) +#elif defined(__clang__) && !defined(HEDLEY_IBM_VERSION) +#define SIMDE_VECTORIZE _Pragma("clang loop vectorize(enable)") +#define SIMDE_VECTORIZE_SAFELEN(l) HEDLEY_PRAGMA(clang loop vectorize_width(l)) +#define SIMDE_VECTORIZE_REDUCTION(r) SIMDE_VECTORIZE +#define SIMDE_VECTORIZE_ALIGNED(a) #elif HEDLEY_GCC_VERSION_CHECK(4, 9, 0) -#define SIMDE__VECTORIZE _Pragma("GCC ivdep") -#define SIMDE__VECTORIZE_SAFELEN(l) SIMDE__VECTORIZE -#define SIMDE__VECTORIZE_REDUCTION(r) SIMDE__VECTORIZE -#define SIMDE__VECTORIZE_ALIGNED(a) +#define SIMDE_VECTORIZE _Pragma("GCC ivdep") +#define SIMDE_VECTORIZE_SAFELEN(l) SIMDE_VECTORIZE +#define SIMDE_VECTORIZE_REDUCTION(r) SIMDE_VECTORIZE +#define SIMDE_VECTORIZE_ALIGNED(a) #elif HEDLEY_CRAY_VERSION_CHECK(5, 0, 0) -#define SIMDE__VECTORIZE _Pragma("_CRI ivdep") -#define SIMDE__VECTORIZE_SAFELEN(l) SIMDE__VECTORIZE -#define SIMDE__VECTORIZE_REDUCTION(r) SIMDE__VECTORIZE -#define SIMDE__VECTORIZE_ALIGNED(a) +#define SIMDE_VECTORIZE _Pragma("_CRI ivdep") +#define SIMDE_VECTORIZE_SAFELEN(l) SIMDE_VECTORIZE +#define SIMDE_VECTORIZE_REDUCTION(r) SIMDE_VECTORIZE +#define SIMDE_VECTORIZE_ALIGNED(a) #else -#define SIMDE__VECTORIZE -#define SIMDE__VECTORIZE_SAFELEN(l) -#define SIMDE__VECTORIZE_REDUCTION(r) -#define SIMDE__VECTORIZE_ALIGNED(a) +#define SIMDE_VECTORIZE +#define SIMDE_VECTORIZE_SAFELEN(l) +#define SIMDE_VECTORIZE_REDUCTION(r) +#define SIMDE_VECTORIZE_ALIGNED(a) #endif -#if 
HEDLEY_GCC_HAS_ATTRIBUTE(unused, 3, 1, 0) -#define SIMDE__UNUSED __attribute__((__unused__)) +#define SIMDE_MASK_NZ_(v, mask) (((v) & (mask)) | !((v) & (mask))) + +/* Intended for checking coverage, you should never use this in + production. */ +#if defined(SIMDE_NO_INLINE) +#define SIMDE_FUNCTION_ATTRIBUTES HEDLEY_NEVER_INLINE static #else -#define SIMDE__UNUSED +#define SIMDE_FUNCTION_ATTRIBUTES HEDLEY_ALWAYS_INLINE static #endif -#if HEDLEY_GCC_HAS_ATTRIBUTE(artificial, 4, 3, 0) -#define SIMDE__ARTIFICIAL __attribute__((__artificial__)) +#if HEDLEY_HAS_ATTRIBUTE(unused) || HEDLEY_GCC_VERSION_CHECK(2, 95, 0) +#define SIMDE_FUNCTION_POSSIBLY_UNUSED_ __attribute__((__unused__)) #else -#define SIMDE__ARTIFICIAL +#define SIMDE_FUNCTION_POSSIBLY_UNUSED_ #endif -/* Intended for checking coverage, you should never use this in - production. */ -#if defined(SIMDE_NO_INLINE) -#define SIMDE__FUNCTION_ATTRIBUTES HEDLEY_NEVER_INLINE SIMDE__UNUSED static +#if HEDLEY_HAS_WARNING("-Wused-but-marked-unused") +#define SIMDE_DIAGNOSTIC_DISABLE_USED_BUT_MARKED_UNUSED \ + _Pragma("clang diagnostic ignored \"-Wused-but-marked-unused\"") #else -#define SIMDE__FUNCTION_ATTRIBUTES HEDLEY_INLINE SIMDE__ARTIFICIAL static +#define SIMDE_DIAGNOSTIC_DISABLE_USED_BUT_MARKED_UNUSED #endif #if defined(_MSC_VER) -#define SIMDE__BEGIN_DECLS \ +#define SIMDE_BEGIN_DECLS_ \ HEDLEY_DIAGNOSTIC_PUSH __pragma(warning(disable : 4996 4204)) \ HEDLEY_BEGIN_C_DECLS -#define SIMDE__END_DECLS HEDLEY_DIAGNOSTIC_POP HEDLEY_END_C_DECLS +#define SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP HEDLEY_END_C_DECLS +#else +#define SIMDE_BEGIN_DECLS_ \ + HEDLEY_DIAGNOSTIC_PUSH \ + SIMDE_DIAGNOSTIC_DISABLE_USED_BUT_MARKED_UNUSED \ + HEDLEY_BEGIN_C_DECLS +#define SIMDE_END_DECLS_ \ + HEDLEY_END_C_DECLS \ + HEDLEY_DIAGNOSTIC_POP +#endif + +#if HEDLEY_HAS_WARNING("-Wpedantic") +#define SIMDE_DIAGNOSTIC_DISABLE_INT128 \ + _Pragma("clang diagnostic ignored \"-Wpedantic\"") +#elif defined(HEDLEY_GCC_VERSION) +#define SIMDE_DIAGNOSTIC_DISABLE_INT128 \ + _Pragma("GCC diagnostic ignored \"-Wpedantic\"") #else -#define SIMDE__BEGIN_DECLS HEDLEY_BEGIN_C_DECLS -#define SIMDE__END_DECLS HEDLEY_END_C_DECLS +#define SIMDE_DIAGNOSTIC_DISABLE_INT128 #endif #if defined(__SIZEOF_INT128__) -#define SIMDE__HAVE_INT128 +#define SIMDE_HAVE_INT128_ +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DIAGNOSTIC_DISABLE_INT128 typedef __int128 simde_int128; typedef unsigned __int128 simde_uint128; +HEDLEY_DIAGNOSTIC_POP +#endif + +#if !defined(SIMDE_ENDIAN_LITTLE) +#define SIMDE_ENDIAN_LITTLE 1234 +#endif +#if !defined(SIMDE_ENDIAN_BIG) +#define SIMDE_ENDIAN_BIG 4321 +#endif + +#if !defined(SIMDE_ENDIAN_ORDER) +/* GCC (and compilers masquerading as GCC) define __BYTE_ORDER__. */ +#if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && \ + (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +#define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_LITTLE +#elif defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && \ + (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +#define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_BIG +/* TI defines _BIG_ENDIAN or _LITTLE_ENDIAN */ +#elif defined(_BIG_ENDIAN) +#define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_BIG +#elif defined(_LITTLE_ENDIAN) +#define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_LITTLE +/* We know the endianness of some common architectures. Common + * architectures not listed (ARM, POWER, MIPS, etc.) here are + * bi-endian. 
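As a quick illustration of the SIMDE_VECTORIZE family renamed above: with SIMDE_ENABLE_OPENMP defined it expands to _Pragma("omp simd"), so a portable fallback loop ends up annotated roughly like the sketch below (assuming a compiler that accepts OpenMP SIMD pragmas; the function is illustrative, not part of the patch):

#include <stddef.h>
#include <stdint.h>

static void add_u16(uint16_t *r, const uint16_t *a, const uint16_t *b,
		    size_t n)
{
#pragma omp simd
	for (size_t i = 0; i < n; i++) {
		r[i] = a[i] + b[i];
	}
}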
*/ +#elif defined(__amd64) || defined(_M_X64) || defined(__i386) || defined(_M_IX86) +#define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_LITTLE +#elif defined(__s390x__) || defined(__zarch__) +#define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_BIG +/* Looks like we'll have to rely on the platform. If we're missing a + * platform, please let us know. */ +#elif defined(_WIN32) +#define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_LITTLE +#elif defined(sun) || defined(__sun) /* Solaris */ +#include +#if defined(_LITTLE_ENDIAN) +#define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_LITTLE +#elif defined(_BIG_ENDIAN) +#define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_BIG +#endif +#elif defined(__APPLE__) +#include +#if defined(__LITTLE_ENDIAN__) +#define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_LITTLE +#elif defined(__BIG_ENDIAN__) +#define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_BIG +#endif +#elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ + defined(__bsdi__) || defined(__DragonFly__) || defined(BSD) +#include +#if defined(__BYTE_ORDER) && (__BYTE_ORDER == __LITTLE_ENDIAN) +#define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_LITTLE +#elif defined(__BYTE_ORDER) && (__BYTE_ORDER == __BIG_ENDIAN) +#define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_BIG +#endif +#elif defined(__linux__) || defined(__linux) || defined(__gnu_linux__) +#include +#if defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && \ + (__BYTE_ORDER == __LITTLE_ENDIAN) +#define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_LITTLE +#elif defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && \ + (__BYTE_ORDER == __BIG_ENDIAN) +#define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_BIG +#endif +#endif +#endif + +#if HEDLEY_HAS_BUILTIN(__builtin_bswap64) || \ + HEDLEY_GCC_VERSION_CHECK(4, 3, 0) || \ + HEDLEY_IBM_VERSION_CHECK(13, 1, 0) || \ + HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) +#define simde_bswap64(v) __builtin_bswap64(v) +#elif HEDLEY_MSVC_VERSION_CHECK(13, 10, 0) +#define simde_bswap64(v) _byteswap_uint64(v) +#else +SIMDE_FUNCTION_ATTRIBUTES +uint64_t simde_bswap64(uint64_t v) +{ + return ((v & (((uint64_t)0xff) << 56)) >> 56) | + ((v & (((uint64_t)0xff) << 48)) >> 40) | + ((v & (((uint64_t)0xff) << 40)) >> 24) | + ((v & (((uint64_t)0xff) << 32)) >> 8) | + ((v & (((uint64_t)0xff) << 24)) << 8) | + ((v & (((uint64_t)0xff) << 16)) << 24) | + ((v & (((uint64_t)0xff) << 8)) << 40) | + ((v & (((uint64_t)0xff))) << 56); +} +#endif + +#if !defined(SIMDE_ENDIAN_ORDER) +#error Unknown byte order; please file a bug +#else +#if SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE +#define simde_endian_bswap64_be(value) simde_bswap64(value) +#define simde_endian_bswap64_le(value) (value) +#elif SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_BIG +#define simde_endian_bswap64_be(value) (value) +#define simde_endian_bswap64_le(value) simde_bswap64(value) +#endif #endif /* TODO: we should at least make an attempt to detect the correct @@ -148,8 +479,6 @@ typedef unsigned __int128 simde_uint128; #define SIMDE_FLOAT32_C(value) ((SIMDE_FLOAT32_TYPE)value) #endif typedef SIMDE_FLOAT32_TYPE simde_float32; -HEDLEY_STATIC_ASSERT(sizeof(simde_float32) == 4, - "Unable to find 32-bit floating-point type."); #if !defined(SIMDE_FLOAT64_TYPE) #define SIMDE_FLOAT64_TYPE double @@ -158,8 +487,6 @@ HEDLEY_STATIC_ASSERT(sizeof(simde_float32) == 4, #define SIMDE_FLOAT32_C(value) ((SIMDE_FLOAT64_TYPE)value) #endif typedef SIMDE_FLOAT64_TYPE simde_float64; -HEDLEY_STATIC_ASSERT(sizeof(simde_float64) == 8, - "Unable to find 64-bit floating-point type."); /* Whether to assume that the compiler can auto-vectorize reasonably well. 
This will cause SIMDe to attempt to compose vector @@ -189,67 +516,171 @@ HEDLEY_STATIC_ASSERT(sizeof(simde_float64) == 8, #if !defined(SIMDE_NO_ASSUME_VECTORIZATION) && \ !defined(SIMDE_ASSUME_VECTORIZATION) #if defined(__SSE__) || defined(__ARM_NEON) || defined(__mips_msa) || \ - defined(__ALTIVEC__) + defined(__ALTIVEC__) || defined(__wasm_simd128__) #define SIMDE_ASSUME_VECTORIZATION #endif #endif -/* GCC and clang have built-in functions to handle shuffling of - vectors, but the implementations are slightly different. This - macro is just an abstraction over them. Note that elem_size is in - bits but vec_size is in bytes. */ -#if HEDLEY_CLANG_HAS_BUILTIN(__builtin_shufflevector) -#define SIMDE__SHUFFLE_VECTOR(elem_size, vec_size, a, b, ...) \ - __builtin_shufflevector(a, b, __VA_ARGS__) -#elif HEDLEY_GCC_HAS_BUILTIN(__builtin_shuffle, 4, 7, 0) && \ - !defined(__INTEL_COMPILER) -#define SIMDE__SHUFFLE_VECTOR(elem_size, vec_size, a, b, ...) \ - __builtin_shuffle(a, b, \ - (int##elem_size##_t __attribute__( \ - (__vector_size__(vec_size)))){__VA_ARGS__}) +#if HEDLEY_HAS_WARNING("-Wbad-function-cast") +#define SIMDE_CONVERT_FTOI(T, v) \ + HEDLEY_DIAGNOSTIC_PUSH \ + _Pragma("clang diagnostic ignored \"-Wbad-function-cast\"") \ + HEDLEY_STATIC_CAST(T, (v)) HEDLEY_DIAGNOSTIC_POP +#else +#define SIMDE_CONVERT_FTOI(T, v) ((T)(v)) +#endif + +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) +#define SIMDE_CHECKED_REINTERPRET_CAST(to, from, value) \ + (_Generic((value), to : (value), from : ((to)(value)))) +#define SIMDE_CHECKED_STATIC_CAST(to, from, value) \ + (_Generic((value), to : (value), from : ((to)(value)))) +#else +#define SIMDE_CHECKED_REINTERPRET_CAST(to, from, value) \ + HEDLEY_REINTERPRET_CAST(to, value) +#define SIMDE_CHECKED_STATIC_CAST(to, from, value) HEDLEY_STATIC_CAST(to, value) #endif -/* Some algorithms are iterative, and fewer iterations means less - accuracy. Lower values here will result in faster, but less - accurate, calculations for some functions. */ -#if !defined(SIMDE_ACCURACY_ITERS) -#define SIMDE_ACCURACY_ITERS 2 +#if HEDLEY_HAS_WARNING("-Wfloat-equal") +#define SIMDE_DIAGNOSTIC_DISABLE_FLOAT_EQUAL \ + _Pragma("clang diagnostic ignored \"-Wfloat-equal\"") +#elif HEDLEY_GCC_VERSION_CHECK(3, 0, 0) +#define SIMDE_DIAGNOSTIC_DISABLE_FLOAT_EQUAL \ + _Pragma("GCC diagnostic ignored \"-Wfloat-equal\"") +#else +#define SIMDE_DIAGNOSTIC_DISABLE_FLOAT_EQUAL #endif -/* This will probably move into Hedley at some point, but I'd like to - more thoroughly check for other compilers which define __GNUC__ - first. */ -#if defined(SIMDE__REALLY_GCC) -#undef SIMDE__REALLY_GCC +/* Some functions can trade accuracy for speed. For those functions + you can control the trade-off using this macro. Possible values: + + 0: prefer speed + 1: reasonable trade-offs + 2: prefer accuracy */ +#if !defined(SIMDE_ACCURACY_PREFERENCE) +#define SIMDE_ACCURACY_PREFERENCE 1 #endif -#if !defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER) -#define SIMDE__REALLY_GCC 0 + +#if defined(__STDC_HOSTED__) +#define SIMDE_STDC_HOSTED __STDC_HOSTED__ +#else +#if defined(HEDLEY_PGI_VERSION_CHECK) || defined(HEDLEY_MSVC_VERSION_CHECK) +#define SIMDE_STDC_HOSTED 1 #else -#define SIMDE__REALLY_GCC 1 +#define SIMDE_STDC_HOSTED 0 +#endif #endif -#if defined(SIMDE__ASSUME_ALIGNED) -#undef SIMDE__ASSUME_ALIGNED +/* Try to deal with environments without a standard library. 
*/ +#if !defined(simde_memcpy) || !defined(simde_memset) +#if !defined(SIMDE_NO_STRING_H) && defined(__has_include) +#if __has_include() +#include +#if !defined(simde_memcpy) +#define simde_memcpy(dest, src, n) memcpy(dest, src, n) +#endif +#if !defined(simde_memset) +#define simde_memset(s, c, n) memset(s, c, n) #endif -#if HEDLEY_INTEL_VERSION_CHECK(9, 0, 0) -#define SIMDE__ASSUME_ALIGNED(ptr, align) __assume_aligned(ptr, align) -#elif HEDLEY_MSVC_VERSION_CHECK(13, 10, 0) -#define SIMDE__ASSUME_ALIGNED(ptr, align) \ - __assume((((char *)ptr) - ((char *)0)) % (align) == 0) -#elif HEDLEY_GCC_HAS_BUILTIN(__builtin_assume_aligned, 4, 7, 0) -#define SIMDE__ASSUME_ALIGNED(ptr, align) \ - (ptr = (__typeof__(ptr))__builtin_assume_aligned((ptr), align)) -#elif HEDLEY_CLANG_HAS_BUILTIN(__builtin_assume) -#define SIMDE__ASSUME_ALIGNED(ptr, align) \ - __builtin_assume((((char *)ptr) - ((char *)0)) % (align) == 0) -#elif HEDLEY_GCC_HAS_BUILTIN(__builtin_unreachable, 4, 5, 0) -#define SIMDE__ASSUME_ALIGNED(ptr, align) \ - ((((char *)ptr) - ((char *)0)) % (align) == 0) \ - ? (1) \ - : (__builtin_unreachable(), 0) #else -#define SIMDE__ASSUME_ALIGNED(ptr, align) +#define SIMDE_NO_STRING_H +#endif +#endif +#endif +#if !defined(simde_memcpy) || !defined(simde_memset) +#if !defined(SIMDE_NO_STRING_H) && (SIMDE_STDC_HOSTED == 1) +#include +#if !defined(simde_memcpy) +#define simde_memcpy(dest, src, n) memcpy(dest, src, n) #endif +#if !defined(simde_memset) +#define simde_memset(s, c, n) memset(s, c, n) +#endif +#elif (HEDLEY_HAS_BUILTIN(__builtin_memcpy) && \ + HEDLEY_HAS_BUILTIN(__builtin_memset)) || \ + HEDLEY_GCC_VERSION_CHECK(4, 2, 0) +#if !defined(simde_memcpy) +#define simde_memcpy(dest, src, n) __builtin_memcpy(dest, src, n) +#endif +#if !defined(simde_memset) +#define simde_memset(s, c, n) __builtin_memset(s, c, n) +#endif +#else +/* These are meant to be portable, not fast. If you're hitting them you + * should think about providing your own (by defining the simde_memcpy + * macro prior to including any SIMDe files) or submitting a patch to + * SIMDe so we can detect your system-provided memcpy/memset, like by + * adding your compiler to the checks for __builtin_memcpy and/or + * __builtin_memset. 
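Since the comment above points at the escape hatch, this is what the override looks like from the consumer side; a sketch only, where my_memcpy/my_memset are hypothetical freestanding routines:

/* Define the macros before including any SIMDe header so the
 * detection logic below is skipped entirely. */
#define simde_memcpy(dest, src, n) my_memcpy((dest), (src), (n))
#define simde_memset(s, c, n) my_memset((s), (c), (n))
#include "simde-common.h"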
*/ +#if !defined(simde_memcpy) +SIMDE_FUNCTION_ATTRIBUTES +void simde_memcpy_(void *dest, const void *src, size_t len) +{ + char *dest_ = HEDLEY_STATIC_CAST(char *, dest); + char *src_ = HEDLEY_STATIC_CAST(const char *, src); + for (size_t i = 0; i < len; i++) { + dest_[i] = src_[i]; + } +} +#define simde_memcpy(dest, src, n) simde_memcpy_(dest, src, n) +#endif + +#if !defined(simde_memset) +SIMDE_FUNCTION_ATTRIBUTES +void simde_memset_(void *s, int c, size_t len) +{ + char *s_ = HEDLEY_STATIC_CAST(char *, s); + char c_ = HEDLEY_STATIC_CAST(char, c); + for (size_t i = 0; i < len; i++) { + s_[i] = c_[i]; + } +} +#define simde_memset(s, c, n) simde_memset_(s, c, n) +#endif +#endif /* !defined(SIMDE_NO_STRING_H) && (SIMDE_STDC_HOSTED == 1) */ +#endif /* !defined(simde_memcpy) || !defined(simde_memset) */ + +#include "simde-math.h" + +#if defined(FE_ALL_EXCEPT) +#define SIMDE_HAVE_FENV_H +#elif defined(__has_include) +#if __has_include() +#include +#define SIMDE_HAVE_FENV_H +#endif +#elif SIMDE_STDC_HOSTED == 1 +#include +#define SIMDE_HAVE_FENV_H +#endif + +#if defined(EXIT_FAILURE) +#define SIMDE_HAVE_STDLIB_H +#elif defined(__has_include) +#if __has_include() +#include +#define SIMDE_HAVE_STDLIB_H +#endif +#elif SIMDE_STDC_HOSTED == 1 +#include +#define SIMDE_HAVE_STDLIB_H +#endif + +#if defined(__has_include) +#if defined(__cplusplus) && (__cplusplus >= 201103L) && __has_include() +#include +#elif __has_include() +#include +#endif +#if __has_include() +#include +#endif +#elif SIMDE_STDC_HOSTED == 1 +#include +#include +#endif + +#include "check.h" /* Sometimes we run into problems with specific versions of compilers which make the native versions unusable for us. Often this is due @@ -258,7 +689,7 @@ HEDLEY_STATIC_ASSERT(sizeof(simde_float64) == 8, start only defining them for problematic compiler versions. */ #if !defined(SIMDE_IGNORE_COMPILER_BUGS) -#if SIMDE__REALLY_GCC +#if defined(HEDLEY_GCC_VERSION) #if !HEDLEY_GCC_VERSION_CHECK(4, 9, 0) #define SIMDE_BUG_GCC_REV_208793 #endif @@ -268,11 +699,53 @@ HEDLEY_STATIC_ASSERT(sizeof(simde_float64) == 8, #if !HEDLEY_GCC_VERSION_CHECK(4, 6, 0) #define SIMDE_BUG_GCC_BAD_MM_EXTRACT_EPI8 /* TODO: find relevant bug or commit */ #endif +#if !HEDLEY_GCC_VERSION_CHECK(8, 0, 0) +#define SIMDE_BUG_GCC_REV_247851 +#endif +#if !HEDLEY_GCC_VERSION_CHECK(10, 0, 0) +#define SIMDE_BUG_GCC_REV_274313 +#define SIMDE_BUG_GCC_91341 +#endif +#if !HEDLEY_GCC_VERSION_CHECK(9, 0, 0) && defined(SIMDE_ARCH_AARCH64) +#define SIMDE_BUG_GCC_ARM_SHIFT_SCALAR +#endif +#if defined(SIMDE_ARCH_X86) && !defined(SIMDE_ARCH_AMD64) +#define SIMDE_BUG_GCC_94482 +#endif +#if !HEDLEY_GCC_VERSION_CHECK(9, 4, 0) && defined(SIMDE_ARCH_AARCH64) +#define SIMDE_BUG_GCC_94488 #endif -#if defined(__EMSCRIPTEN__) +#if defined(SIMDE_ARCH_POWER) +#define SIMDE_BUG_GCC_95227 +#endif +#define SIMDE_BUG_GCC_95399 +#elif defined(__clang__) +#if defined(SIMDE_ARCH_AARCH64) +#define SIMDE_BUG_CLANG_45541 +#endif +#endif +#if defined(HEDLEY_EMSCRIPTEN_VERSION) #define SIMDE_BUG_EMSCRIPTEN_MISSING_IMPL /* Placeholder for (as yet) unfiled issues. 
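These SIMDE_BUG_* macros are consumed elsewhere in SIMDe to steer around native code paths that miscompile on specific compiler versions. The general shape is a guard like the following sketch; the pairing of feature macro and bug macro here is purely illustrative:

#if defined(SIMDE_X86_MMX_NATIVE) && !defined(SIMDE_BUG_GCC_REV_208793)
/* ... use the native intrinsic ... */
#else
/* ... use the portable fallback ... */
#endif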
*/ #define SIMDE_BUG_EMSCRIPTEN_5242 #endif #endif +/* GCC and Clang both have the same issue: + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95144 + * https://bugs.llvm.org/show_bug.cgi?id=45931 + */ +#if HEDLEY_HAS_WARNING("-Wsign-conversion") || HEDLEY_GCC_VERSION_CHECK(4, 3, 0) +#define SIMDE_BUG_IGNORE_SIGN_CONVERSION(expr) \ + (__extension__({ \ + HEDLEY_DIAGNOSTIC_PUSH \ + HEDLEY_DIAGNOSTIC_POP \ + _Pragma("GCC diagnostic ignored \"-Wsign-conversion\"") __typeof__( \ + expr) simde_bug_ignore_sign_conversion_v_ = (expr); \ + HEDLEY_DIAGNOSTIC_PUSH \ + simde_bug_ignore_sign_conversion_v_; \ + })) +#else +#define SIMDE_BUG_IGNORE_SIGN_CONVERSION(expr) (expr) +#endif + #endif /* !defined(SIMDE_COMMON_H) */ diff --git a/libobs/util/simde/simde-diagnostic.h b/libobs/util/simde/simde-diagnostic.h new file mode 100644 index 0000000000000000000000000000000000000000..d1b9b2634058f866c83bfd7744a5e90b9626df1b --- /dev/null +++ b/libobs/util/simde/simde-diagnostic.h @@ -0,0 +1,270 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2017-2020 Evan Nemerson + */ + +/* SIMDe targets a very wide range of standards and compilers, and our + * goal is to compile cleanly even with extremely aggressive warnings + * (i.e., -Weverything in clang, -Wextra in GCC, /W4 for MSVC, etc.) + * treated as errors. + * + * While our preference is to resolve the underlying issue a given + * diagnostic is warning us about, sometimes that's not possible. + * Fixing a warning in one compiler may cause problems in another. + * Sometimes a warning doesn't really apply to us (false positives), + * and sometimes adhering to a warning would mean dropping a feature + * we *know* the compiler supports since we have tested specifically + * for the compiler or feature. + * + * When practical, warnings are only disabled for specific code. For + * a list of warnings which are enabled by default in all SIMDe code, + * see SIMDE_DISABLE_UNWANTED_DIAGNOSTICS. Note that we restore the + * warning stack when SIMDe is done parsing, so code which includes + * SIMDe is not deprived of these warnings. + */ + +#if !defined(SIMDE_DIAGNOSTIC_H) + +#include "hedley.h" + +/* This is only to help us implement functions like _mm_undefined_ps. 
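+ * The expected pattern is to wrap whatever deliberately reads an
+ * uninitialized value, roughly:
+ *
+ *   HEDLEY_DIAGNOSTIC_PUSH
+ *   SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_
+ *   // ... code returning a deliberately uninitialized vector ...
+ *   HEDLEY_DIAGNOSTIC_POP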
*/ +#if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) +#undef SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ +#endif +#if HEDLEY_HAS_WARNING("-Wuninitialized") +#define SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ \ + _Pragma("clang diagnostic ignored \"-Wuninitialized\"") +#elif HEDLEY_GCC_VERSION_CHECK(4, 2, 0) +#define SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ \ + _Pragma("GCC diagnostic ignored \"-Wuninitialized\"") +#elif HEDLEY_PGI_VERSION_CHECK(19, 10, 0) +#define SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ _Pragma("diag_suppress 549") +#elif HEDLEY_SUNPRO_VERSION_CHECK(5, 14, 0) && defined(__cplusplus) +#define SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ \ + _Pragma("error_messages(off,SEC_UNINITIALIZED_MEM_READ,SEC_UNDEFINED_RETURN_VALUE,unassigned)") +#elif HEDLEY_SUNPRO_VERSION_CHECK(5, 14, 0) +#define SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ \ + _Pragma("error_messages(off,SEC_UNINITIALIZED_MEM_READ,SEC_UNDEFINED_RETURN_VALUE)") +#elif HEDLEY_SUNPRO_VERSION_CHECK(5, 12, 0) && defined(__cplusplus) +#define SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ \ + _Pragma("error_messages(off,unassigned)") +#elif HEDLEY_TI_VERSION_CHECK(16, 9, 9) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(8, 0, 0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1, 2, 0) || \ + HEDLEY_TI_CLPRU_VERSION_CHECK(2, 3, 2) +#define SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ _Pragma("diag_suppress 551") +#elif HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) +#define SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ _Pragma("warning(disable:592)") +#elif HEDLEY_MSVC_VERSION_CHECK(19, 0, 0) && !defined(__MSVC_RUNTIME_CHECKS) +#define SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ \ + __pragma(warning(disable : 4700)) +#endif + +/* GCC emits a lot of "notes" about the ABI being different for things + * in newer versions of GCC. We don't really care because all our + * functions are inlined and don't generate ABI. */ +#if HEDLEY_GCC_VERSION_CHECK(7, 0, 0) +#define SIMDE_DIAGNOSTIC_DISABLE_PSABI_ \ + _Pragma("GCC diagnostic ignored \"-Wpsabi\"") +#else +#define SIMDE_DIAGNOSTIC_DISABLE_PSABI_ +#endif + +/* Since MMX uses x87 FP registers, you're supposed to call _mm_empty() + * after each MMX function before any floating point instructions. + * Some compilers warn about functions which use MMX functions but + * don't call _mm_empty(). However, since SIMDe is implementyng the + * MMX API we shouldn't be calling _mm_empty(); we leave it to the + * caller to invoke simde_mm_empty(). */ +#if HEDLEY_INTEL_VERSION_CHECK(19, 0, 0) +#define SIMDE_DIAGNOSTIC_DISABLE_NO_EMMS_INSTRUCTION_ \ + _Pragma("warning(disable:13200 13203)") +#elif defined(HEDLEY_MSVC_VERSION) +#define SIMDE_DIAGNOSTIC_DISABLE_NO_EMMS_INSTRUCTION_ \ + __pragma(warning(disable : 4799)) +#else +#define SIMDE_DIAGNOSTIC_DISABLE_NO_EMMS_INSTRUCTION_ +#endif + +/* Intel is pushing people to use OpenMP SIMD instead of Cilk+, so they + * emit a diagnostic if you use #pragma simd instead of + * #pragma omp simd. SIMDe supports OpenMP SIMD, you just need to + * compile with -qopenmp or -qopenmp-simd and define + * SIMDE_ENABLE_OPENMP. Cilk+ is just a fallback. */ +#if HEDLEY_INTEL_VERSION_CHECK(18, 0, 0) +#define SIMDE_DIAGNOSTIC_DISABLE_SIMD_PRAGMA_DEPRECATED_ \ + _Pragma("warning(disable:3948)") +#else +#define SIMDE_DIAGNOSTIC_DISABLE_SIMD_PRAGMA_DEPRECATED_ +#endif + +#if defined(HEDLEY_MSVC_VERSION) +#define SIMDE_DIAGNOSTIC_DISABLE_NON_CONSTANT_AGGREGATE_INITIALIZER_ \ + __pragma(warning(disable : 4204)) +#else +#define SIMDE_DIAGNOSTIC_DISABLE_NON_CONSTANT_AGGREGATE_INITIALIZER_ +#endif + +/* This warning needs a lot of work. 
It is triggered if all you do is + * pass the value to memcpy/__builtin_memcpy, or if you initialize a + * member of the union, even if that member takes up the entire union. + * Last tested with clang-10, hopefully things will improve in the + * future; if clang fixes this I'd love to enable it. */ +#if HEDLEY_HAS_WARNING("-Wconditional-uninitialized") +#define SIMDE_DIAGNOSTIC_DISABLE_CONDITIONAL_UNINITIALIZED_ \ + _Pragma("clang diagnostic ignored \"-Wconditional-uninitialized\"") +#else +#define SIMDE_DIAGNOSTIC_DISABLE_CONDITIONAL_UNINITIALIZED_ +#endif + +/* This warning is meant to catch things like `0.3 + 0.4 == 0.7`, which + * will is false. However, SIMDe uses these operations exclusively + * for things like _mm_cmpeq_ps, for which we really do want to check + * for equality (or inequality). + * + * If someone wants to put together a SIMDE_FLOAT_EQUAL(a, op, b) macro + * which just wraps a check in some code do disable this diagnostic I'd + * be happy to accept it. */ +#if HEDLEY_HAS_WARNING("-Wfloat-equal") || HEDLEY_GCC_VERSION_CHECK(3, 0, 0) +#define SIMDE_DIAGNOSTIC_DISABLE_FLOAT_EQUAL_ \ + _Pragma("GCC diagnostic ignored \"-Wfloat-equal\"") +#else +#define SIMDE_DIAGNOSTIC_DISABLE_FLOAT_EQUAL_ +#endif + +/* This is because we use HEDLEY_STATIC_ASSERT for static assertions. + * If Hedley can't find an implementation it will preprocess to + * nothing, which means there will be a trailing semi-colon. */ +#if HEDLEY_HAS_WARNING("-Wextra-semi") +#define SIMDE_DIAGNOSTIC_DISABLE_EXTRA_SEMI_ \ + _Pragma("clang diagnostic ignored \"-Wextra-semi\"") +#elif HEDLEY_GCC_VERSION_CHECK(8, 1, 0) && defined(__cplusplus) +#define SIMDE_DIAGNOSTIC_DISABLE_EXTRA_SEMI_ \ + _Pragma("GCC diagnostic ignored \"-Wextra-semi\"") +#else +#define SIMDE_DIAGNOSTIC_DISABLE_EXTRA_SEMI_ +#endif + +/* We do use a few variadic macros, which technically aren't available + * until C99 and C++11, but every compiler I'm aware of has supported + * them for much longer. That said, usage is isolated to the test + * suite and compilers known to support them. */ +#if HEDLEY_HAS_WARNING("-Wvariadic-macros") || HEDLEY_GCC_VERSION_CHECK(4, 0, 0) +#if HEDLEY_HAS_WARNING("-Wc++98-compat-pedantic") +#define SIMDE_DIAGNOSTIC_DISABLE_VARIADIC_MACROS_ \ + _Pragma("clang diagnostic ignored \"-Wvariadic-macros\"") _Pragma( \ + "clang diagnostic ignored \"-Wc++98-compat-pedantic\"") +#else +#define SIMDE_DIAGNOSTIC_DISABLE_VARIADIC_MACROS_ \ + _Pragma("GCC diagnostic ignored \"-Wvariadic-macros\"") +#endif +#else +#define SIMDE_DIAGNOSTIC_DISABLE_VARIADIC_MACROS_ +#endif + +/* Triggered when assigning a float to a double implicitly. We use + * explicit casts in SIMDe, this is only used in the test suite. */ +#if HEDLEY_HAS_WARNING("-Wdouble-promotion") +#define SIMDE_DIAGNOSTIC_DISABLE_DOUBLE_PROMOTION_ \ + _Pragma("clang diagnostic ignored \"-Wdouble-promotion\"") +#else +#define SIMDE_DIAGNOSTIC_DISABLE_DOUBLE_PROMOTION_ +#endif + +/* Several compilers treat conformant array parameters as VLAs. We + * test to make sure we're in C mode (C++ doesn't support CAPs), and + * that the version of the standard supports CAPs. We also blacklist + * some buggy compilers like MSVC (the logic is in Hedley if you want + * to take a look), but with certain warnings enabled some compilers + * still like to emit a diagnostic. 
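+ *
+ * The declarations in question look roughly like
+ *
+ *   void fn(simde_float32 mem_addr[HEDLEY_ARRAY_PARAM(4)]);
+ *
+ * (fn is only an illustrative name): the bound is a constant
+ * expression, yet the parameter is still reported as a VLA.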
*/ +#if HEDLEY_HAS_WARNING("-Wvla") +#define SIMDE_DIAGNOSTIC_DISABLE_VLA_ \ + _Pragma("clang diagnostic ignored \"-Wvla\"") +#elif HEDLEY_GCC_VERSION_CHECK(4, 3, 0) +#define SIMDE_DIAGNOSTIC_DISABLE_VLA_ \ + _Pragma("GCC diagnostic ignored \"-Wvla\"") +#else +#define SIMDE_DIAGNOSTIC_DISABLE_VLA_ +#endif + +#if HEDLEY_HAS_WARNING("-Wused-but-marked-unused") +#define SIMDE_DIAGNOSTIC_DISABLE_USED_BUT_MARKED_UNUSED_ \ + _Pragma("clang diagnostic ignored \"-Wused-but-marked-unused\"") +#else +#define SIMDE_DIAGNOSTIC_DISABLE_USED_BUT_MARKED_UNUSED_ +#endif + +#if HEDLEY_HAS_WARNING("-Wunused-function") +#define SIMDE_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION_ \ + _Pragma("clang diagnostic ignored \"-Wunused-function\"") +#elif HEDLEY_GCC_VERSION_CHECK(3, 4, 0) +#define SIMDE_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION_ \ + _Pragma("GCC diagnostic ignored \"-Wunused-function\"") +#else +#define SIMDE_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION_ +#endif + +#if HEDLEY_HAS_WARNING("-Wpass-failed") +#define SIMDE_DIAGNOSTIC_DISABLE_PASS_FAILED_ \ + _Pragma("clang diagnostic ignored \"-Wpass-failed\"") +#else +#define SIMDE_DIAGNOSTIC_DISABLE_PASS_FAILED_ +#endif + +/* https://github.com/nemequ/simde/issues/277 */ +#if defined(HEDLEY_GCC_VERSION) && HEDLEY_GCC_VERSION_CHECK(4, 6, 0) && \ + !HEDLEY_GCC_VERSION_CHECK(6, 0, 0) && defined(__cplusplus) +#define SIMDE_DIAGNOSTIC_DISABLE_BUGGY_UNUSED_BUT_SET_VARIBALE \ + _Pragma("GCC diagnostic ignored \"-Wunused-but-set-variable\"") +#else +#define SIMDE_DIAGNOSTIC_DISABLE_BUGGY_UNUSED_BUT_SET_VARIBALE +#endif + +/* Some compilers, such as clang, may use `long long` for 64-bit + * integers, but `long long` triggers a diagnostic with + * -Wc++98-compat-pedantic which says 'long long' is incompatible with + * C++98. */ +#if HEDLEY_HAS_WARNING("-Wc++98-compat-pedantic") +#define SIMDE_DIAGNOSTIC_DISABLE_CPP98_COMPAT_PEDANTIC \ + _Pragma("clang diagnostic ignored \"-Wc++98-compat-pedantic\"") +#else +#define SIMDE_DIAGNOSTIC_DISABLE_CPP98_COMPAT_PEDANTIC +#endif + +#define SIMDE_DISABLE_UNWANTED_DIAGNOSTICS \ + SIMDE_DIAGNOSTIC_DISABLE_PSABI_ \ + SIMDE_DIAGNOSTIC_DISABLE_NO_EMMS_INSTRUCTION_ \ + SIMDE_DIAGNOSTIC_DISABLE_SIMD_PRAGMA_DEPRECATED_ \ + SIMDE_DIAGNOSTIC_DISABLE_CONDITIONAL_UNINITIALIZED_ \ + SIMDE_DIAGNOSTIC_DISABLE_FLOAT_EQUAL_ \ + SIMDE_DIAGNOSTIC_DISABLE_NON_CONSTANT_AGGREGATE_INITIALIZER_ \ + SIMDE_DIAGNOSTIC_DISABLE_EXTRA_SEMI_ \ + SIMDE_DIAGNOSTIC_DISABLE_VLA_ \ + SIMDE_DIAGNOSTIC_DISABLE_USED_BUT_MARKED_UNUSED_ \ + SIMDE_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION_ \ + SIMDE_DIAGNOSTIC_DISABLE_PASS_FAILED_ \ + SIMDE_DIAGNOSTIC_DISABLE_CPP98_COMPAT_PEDANTIC \ + SIMDE_DIAGNOSTIC_DISABLE_BUGGY_UNUSED_BUT_SET_VARIBALE + +#endif diff --git a/libobs/util/simde/simde-features.h b/libobs/util/simde/simde-features.h new file mode 100644 index 0000000000000000000000000000000000000000..61ab6365f6f82783c463934cec66520c60d3e9ce --- /dev/null +++ b/libobs/util/simde/simde-features.h @@ -0,0 +1,357 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies 
or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2020 Evan Nemerson + */ + +/* simde-arch.h is used to determine which features are available according + to the compiler. However, we want to make it possible to forcibly enable + or disable APIs */ + +#if !defined(SIMDE_FEATURES_H) +#define SIMDE_FEATURES_H + +#include "simde-arch.h" + +#if !defined(SIMDE_X86_SVML_NATIVE) && !defined(SIMDE_X86_SVML_NO_NATIVE) && \ + !defined(SIMDE_NO_NATIVE) +#if defined(SIMDE_ARCH_X86_SVML) +#define SIMDE_X86_SVML_NATIVE +#endif +#endif +#if defined(SIMDE_X86_SVML_NATIVE) && !defined(SIMDE_X86_AVX512F_NATIVE) +#define SIMDE_X86_AVX512F_NATIVE +#endif + +#if !defined(SIMDE_X86_AVX512CD_NATIVE) && \ + !defined(SIMDE_X86_AVX512CD_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) +#if defined(SIMDE_ARCH_X86_AVX512CD) +#define SIMDE_X86_AVX512CD_NATIVE +#endif +#endif +#if defined(SIMDE_X86_AVX512CD_NATIVE) && !defined(SIMDE_X86_AVX512F_NATIVE) +#define SIMDE_X86_AVX512F_NATIVE +#endif + +#if !defined(SIMDE_X86_AVX512DQ_NATIVE) && \ + !defined(SIMDE_X86_AVX512DQ_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) +#if defined(SIMDE_ARCH_X86_AVX512DQ) +#define SIMDE_X86_AVX512DQ_NATIVE +#endif +#endif +#if defined(SIMDE_X86_AVX512DQ_NATIVE) && !defined(SIMDE_X86_AVX512F_NATIVE) +#define SIMDE_X86_AVX512F_NATIVE +#endif + +#if !defined(SIMDE_X86_AVX512VL_NATIVE) && \ + !defined(SIMDE_X86_AVX512VL_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) +#if defined(SIMDE_ARCH_X86_AVX512VL) +#define SIMDE_X86_AVX512VL_NATIVE +#endif +#endif +#if defined(SIMDE_X86_AVX512VL_NATIVE) && !defined(SIMDE_X86_AVX512F_NATIVE) +#define SIMDE_X86_AVX512F_NATIVE +#endif + +#if !defined(SIMDE_X86_AVX512BW_NATIVE) && \ + !defined(SIMDE_X86_AVX512BW_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) +#if defined(SIMDE_ARCH_X86_AVX512BW) +#define SIMDE_X86_AVX512BW_NATIVE +#endif +#endif +#if defined(SIMDE_X86_AVX512BW_NATIVE) && !defined(SIMDE_X86_AVX512F_NATIVE) +#define SIMDE_X86_AVX512F_NATIVE +#endif + +#if !defined(SIMDE_X86_AVX512F_NATIVE) && \ + !defined(SIMDE_X86_AVX512F_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) +#if defined(SIMDE_ARCH_X86_AVX512F) +#define SIMDE_X86_AVX512F_NATIVE +#endif +#endif +#if defined(SIMDE_X86_AVX512F_NATIVE) && !defined(SIMDE_X86_AVX2_NATIVE) +#define SIMDE_X86_AVX2_NATIVE +#endif + +#if !defined(SIMDE_X86_FMA_NATIVE) && !defined(SIMDE_X86_FMA_NO_NATIVE) && \ + !defined(SIMDE_NO_NATIVE) +#if defined(SIMDE_ARCH_X86_FMA) +#define SIMDE_X86_FMA_NATIVE +#endif +#endif +#if defined(SIMDE_X86_FMA_NATIVE) && !defined(SIMDE_X86_AVX_NATIVE) +#define SIMDE_X86_AVX_NATIVE +#endif + +#if !defined(SIMDE_X86_AVX2_NATIVE) && !defined(SIMDE_X86_AVX2_NO_NATIVE) && \ + !defined(SIMDE_NO_NATIVE) +#if defined(SIMDE_ARCH_X86_AVX2) +#define SIMDE_X86_AVX2_NATIVE +#endif +#endif +#if defined(SIMDE_X86_AVX2_NATIVE) && !defined(SIMDE_X86_AVX_NATIVE) +#define SIMDE_X86_AVX_NATIVE +#endif + +#if !defined(SIMDE_X86_AVX_NATIVE) && !defined(SIMDE_X86_AVX_NO_NATIVE) && \ + !defined(SIMDE_NO_NATIVE) +#if defined(SIMDE_ARCH_X86_AVX) +#define SIMDE_X86_AVX_NATIVE +#endif 
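+/* As with the other feature blocks in this header, the consumer can
+ * override detection; the exact flags are up to the build system, but
+ * the idea is:
+ *
+ *   cc -DSIMDE_X86_AVX_NO_NATIVE ...  do not use native AVX even if available
+ *   cc -DSIMDE_NO_NATIVE ...          disable every native code path
+ */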
+#endif +#if defined(SIMDE_X86_AVX_NATIVE) && !defined(SIMDE_X86_SSE4_1_NATIVE) +#define SIMDE_X86_SSE4_2_NATIVE +#endif + +#if !defined(SIMDE_X86_SSE4_2_NATIVE) && \ + !defined(SIMDE_X86_SSE4_2_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) +#if defined(SIMDE_ARCH_X86_SSE4_2) +#define SIMDE_X86_SSE4_2_NATIVE +#endif +#endif +#if defined(SIMDE_X86_SSE4_2_NATIVE) && !defined(SIMDE_X86_SSE4_1_NATIVE) +#define SIMDE_X86_SSE4_1_NATIVE +#endif + +#if !defined(SIMDE_X86_SSE4_1_NATIVE) && \ + !defined(SIMDE_X86_SSE4_1_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) +#if defined(SIMDE_ARCH_X86_SSE4_1) +#define SIMDE_X86_SSE4_1_NATIVE +#endif +#endif +#if defined(SIMDE_X86_SSE4_1_NATIVE) && !defined(SIMDE_X86_SSSE3_NATIVE) +#define SIMDE_X86_SSSE3_NATIVE +#endif + +#if !defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_X86_SSSE3_NO_NATIVE) && \ + !defined(SIMDE_NO_NATIVE) +#if defined(SIMDE_ARCH_X86_SSSE3) +#define SIMDE_X86_SSSE3_NATIVE +#endif +#endif +#if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_X86_SSE3_NATIVE) +#define SIMDE_X86_SSE3_NATIVE +#endif + +#if !defined(SIMDE_X86_SSE3_NATIVE) && !defined(SIMDE_X86_SSE3_NO_NATIVE) && \ + !defined(SIMDE_NO_NATIVE) +#if defined(SIMDE_ARCH_X86_SSE3) +#define SIMDE_X86_SSE3_NATIVE +#endif +#endif +#if defined(SIMDE_X86_SSE3_NATIVE) && !defined(SIMDE_X86_SSE2_NATIVE) +#define SIMDE_X86_SSE2_NATIVE +#endif + +#if !defined(SIMDE_X86_SSE2_NATIVE) && !defined(SIMDE_X86_SSE2_NO_NATIVE) && \ + !defined(SIMDE_NO_NATIVE) +#if defined(SIMDE_ARCH_X86_SSE2) +#define SIMDE_X86_SSE2_NATIVE +#endif +#endif +#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(SIMDE_X86_SSE_NATIVE) +#define SIMDE_X86_SSE_NATIVE +#endif + +#if !defined(SIMDE_X86_SSE_NATIVE) && !defined(SIMDE_X86_SSE_NO_NATIVE) && \ + !defined(SIMDE_NO_NATIVE) +#if defined(SIMDE_ARCH_X86_SSE) +#define SIMDE_X86_SSE_NATIVE +#endif +#endif + +#if !defined(SIMDE_X86_MMX_NATIVE) && !defined(SIMDE_X86_MMX_NO_NATIVE) && \ + !defined(SIMDE_NO_NATIVE) +#if defined(SIMDE_ARCH_X86_MMX) +#define SIMDE_X86_MMX_NATIVE +#endif +#endif + +#if !defined(SIMDE_X86_GFNI_NATIVE) && !defined(SIMDE_X86_GFNI_NO_NATIVE) && \ + !defined(SIMDE_NO_NATIVE) +#if defined(SIMDE_ARCH_X86_GFNI) +#define SIMDE_X86_GFNI_NATIVE +#endif +#endif + +#if !defined(SIMDE_X86_SVML_NATIVE) && !defined(SIMDE_X86_SVML_NO_NATIVE) && \ + !defined(SIMDE_NO_NATIVE) +#if defined(__INTEL_COMPILER) +#define SIMDE_X86_SVML_NATIVE +#endif +#endif + +#if defined(HEDLEY_MSVC_VERSION) +#pragma warning(push) +#pragma warning(disable : 4799) +#endif + +#if defined(SIMDE_X86_AVX_NATIVE) || defined(SIMDE_X86_GFNI_NATIVE) || \ + defined(SIMDE_X86_SVML_NATIVE) +#include +#elif defined(SIMDE_X86_SSE4_2_NATIVE) +#include +#elif defined(SIMDE_X86_SSE4_1_NATIVE) +#include +#elif defined(SIMDE_X86_SSSE3_NATIVE) +#include +#elif defined(SIMDE_X86_SSE3_NATIVE) +#include +#elif defined(SIMDE_X86_SSE2_NATIVE) +#include +#elif defined(SIMDE_X86_SSE_NATIVE) +#include +#elif defined(SIMDE_X86_MMX_NATIVE) +#include +#endif + +#if defined(HEDLEY_MSVC_VERSION) +#pragma warning(pop) +#endif + +#if !defined(SIMDE_ARM_NEON_A64V8_NATIVE) && \ + !defined(SIMDE_ARM_NEON_A64V8_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) +#if defined(SIMDE_ARCH_ARM_NEON) && defined(SIMDE_ARCH_AARCH64) && \ + SIMDE_ARCH_ARM_CHECK(80) +#define SIMDE_ARM_NEON_A64V8_NATIVE +#endif +#endif +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && \ + !defined(SIMDE_ARM_NEON_A32V8_NATIVE) +#define SIMDE_ARM_NEON_A32V8_NATIVE +#endif + +#if !defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \ + !defined(SIMDE_ARM_NEON_A32V8_NO_NATIVE) && 
!defined(SIMDE_NO_NATIVE) +#if defined(SIMDE_ARCH_ARM_NEON) && SIMDE_ARCH_ARM_CHECK(80) +#define SIMDE_ARM_NEON_A32V8_NATIVE +#endif +#endif +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \ + !defined(SIMDE_ARM_NEON_A32V7_NATIVE) +#define SIMDE_ARM_NEON_A32V7_NATIVE +#endif + +#if !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + !defined(SIMDE_ARM_NEON_A32V7_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) +#if defined(SIMDE_ARCH_ARM_NEON) && SIMDE_ARCH_ARM_CHECK(70) +#define SIMDE_ARM_NEON_A32V7_NATIVE +#endif +#endif +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) +#include +#endif + +#if !defined(SIMDE_WASM_SIMD128_NATIVE) && \ + !defined(SIMDE_WASM_SIMD128_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) +#if defined(SIMDE_ARCH_WASM_SIMD128) +#define SIMDE_WASM_SIMD128_NATIVE +#endif +#endif +#if defined(SIMDE_WASM_SIMD128_NATIVE) +#if !defined(__wasm_unimplemented_simd128__) +#define __wasm_unimplemented_simd128__ +#endif +#include +#endif + +#if !defined(SIMDE_POWER_ALTIVEC_P9_NATIVE) && \ + !defined(SIMDE_POWER_ALTIVEC_P9_NO_NATIVE) && \ + !defined(SIMDE_NO_NATIVE) +#if SIMDE_ARCH_POWER_ALTIVEC_CHECK(900) +#define SIMDE_POWER_ALTIVEC_P9_NATIVE +#endif +#endif +#if defined(SIMDE_POWER_ALTIVEC_P9_NATIVE) && !defined(SIMDE_POWER_ALTIVEC_P8) +#define SIMDE_POWER_ALTIVEC_P8_NATIVE +#endif + +#if !defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) && \ + !defined(SIMDE_POWER_ALTIVEC_P8_NO_NATIVE) && \ + !defined(SIMDE_NO_NATIVE) +#if SIMDE_ARCH_POWER_ALTIVEC_CHECK(800) +#define SIMDE_POWER_ALTIVEC_P8_NATIVE +#endif +#endif +#if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) && !defined(SIMDE_POWER_ALTIVEC_P7) +#define SIMDE_POWER_ALTIVEC_P7_NATIVE +#endif + +#if !defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) && \ + !defined(SIMDE_POWER_ALTIVEC_P7_NO_NATIVE) && \ + !defined(SIMDE_NO_NATIVE) +#if SIMDE_ARCH_POWER_ALTIVEC_CHECK(700) +#define SIMDE_POWER_ALTIVEC_P7_NATIVE +#endif +#endif +#if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) && !defined(SIMDE_POWER_ALTIVEC_P6) +#define SIMDE_POWER_ALTIVEC_P6_NATIVE +#endif + +#if !defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) && \ + !defined(SIMDE_POWER_ALTIVEC_P6_NO_NATIVE) && \ + !defined(SIMDE_NO_NATIVE) +#if SIMDE_ARCH_POWER_ALTIVEC_CHECK(600) +#define SIMDE_POWER_ALTIVEC_P6_NATIVE +#endif +#endif +#if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) && !defined(SIMDE_POWER_ALTIVEC_P5) +#define SIMDE_POWER_ALTIVEC_P5_NATIVE +#endif + +#if !defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) && \ + !defined(SIMDE_POWER_ALTIVEC_P5_NO_NATIVE) && \ + !defined(SIMDE_NO_NATIVE) +#if SIMDE_ARCH_POWER_ALTIVEC_CHECK(500) +#define SIMDE_POWER_ALTIVEC_P5_NATIVE +#endif +#endif +#if defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) +/* stdbool.h conflicts with the bool in altivec.h */ +#if defined(bool) && !defined(SIMDE_POWER_ALTIVEC_NO_UNDEF_BOOL_) +#undef bool +#endif +#include +/* GCC allows you to undefine these macros to prevent conflicts with + * standard types as they become context-sensitive keywords. 
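+ * The rest of SIMDe therefore refers to AltiVec types through the
+ * wrappers defined just below, e.g.
+ *
+ *   SIMDE_POWER_ALTIVEC_VECTOR(float) v;   // rather than: vector float v
+ *
+ * so the code compiles whether or not the bare keywords are usable.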
*/ +#if defined(__cplusplus) +#if defined(vector) +#undef vector +#endif +#if defined(pixel) +#undef pixel +#endif +#if defined(bool) +#undef bool +#endif +#define SIMDE_POWER_ALTIVEC_VECTOR(T) vector T +#define SIMDE_POWER_ALTIVEC_PIXEL pixel +#define SIMDE_POWER_ALTIVEC_BOOL bool +#else +#define SIMDE_POWER_ALTIVEC_VECTOR(T) __vector T +#define SIMDE_POWER_ALTIVEC_PIXEL __pixel +#define SIMDE_POWER_ALTIVEC_BOOL __bool +#endif /* defined(__cplusplus) */ +#endif + +#endif /* !defined(SIMDE_FEATURES_H) */ diff --git a/libobs/util/simde/sse.h b/libobs/util/simde/sse.h index 6f0788124845a1af08f0f9ebf9dec709cd1e3684..8e80c23eb04401319b5d344c96c5526b50e36738 100644 --- a/libobs/util/simde/sse.h +++ b/libobs/util/simde/sse.h @@ -1,4 +1,6 @@ -/* Permission is hereby granted, free of charge, to any person +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation * files (the "Software"), to deal in the Software without * restriction, including without limitation the rights to use, copy, @@ -19,2036 +21,2793 @@ * SOFTWARE. * * Copyright: - * 2017 Evan Nemerson + * 2017-2020 Evan Nemerson * 2015-2017 John W. Ratcliff * 2015 Brandon Rowlett * 2015 Ken Fast */ -#if !defined(SIMDE__SSE_H) -#if !defined(SIMDE__SSE_H) -#define SIMDE__SSE_H -#endif +#if !defined(SIMDE_X86_SSE_H) +#define SIMDE_X86_SSE_H + #include "mmx.h" -#if defined(SIMDE_SSE_NATIVE) -#undef SIMDE_SSE_NATIVE +#if !defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_ENABLE_NATIVE_ALIASES) +#define SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES +#endif + +#if defined(_WIN32) +#include +#endif + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +typedef union { +#if defined(SIMDE_VECTOR_SUBSCRIPT) + SIMDE_ALIGN(16) int8_t i8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; + SIMDE_ALIGN(16) int16_t i16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; + SIMDE_ALIGN(16) int32_t i32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; + SIMDE_ALIGN(16) int64_t i64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; + SIMDE_ALIGN(16) uint8_t u8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; + SIMDE_ALIGN(16) uint16_t u16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; + SIMDE_ALIGN(16) uint32_t u32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; + SIMDE_ALIGN(16) uint64_t u64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; +#if defined(SIMDE_HAVE_INT128_) + SIMDE_ALIGN(16) simde_int128 i128 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; + SIMDE_ALIGN(16) simde_uint128 u128 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; +#endif + SIMDE_ALIGN(16) simde_float32 f32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; + SIMDE_ALIGN(16) int_fast32_t i32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; + SIMDE_ALIGN(16) uint_fast32_t u32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; +#else + SIMDE_ALIGN(16) int8_t i8[16]; + SIMDE_ALIGN(16) int16_t i16[8]; + SIMDE_ALIGN(16) int32_t i32[4]; + SIMDE_ALIGN(16) int64_t i64[2]; + SIMDE_ALIGN(16) uint8_t u8[16]; + SIMDE_ALIGN(16) uint16_t u16[8]; + SIMDE_ALIGN(16) uint32_t u32[4]; + SIMDE_ALIGN(16) uint64_t u64[2]; +#if defined(SIMDE_HAVE_INT128_) + SIMDE_ALIGN(16) simde_int128 i128[1]; + SIMDE_ALIGN(16) simde_uint128 u128[1]; +#endif + SIMDE_ALIGN(16) simde_float32 f32[4]; + SIMDE_ALIGN(16) int_fast32_t i32f[16 / sizeof(int_fast32_t)]; + SIMDE_ALIGN(16) uint_fast32_t u32f[16 / sizeof(uint_fast32_t)]; +#endif + + SIMDE_ALIGN(16) simde__m64_private m64_private[2]; + SIMDE_ALIGN(16) simde__m64 m64[2]; + +#if defined(SIMDE_X86_SSE_NATIVE) + SIMDE_ALIGN(16) __m128 n; +#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + SIMDE_ALIGN(16) int8x16_t neon_i8; + SIMDE_ALIGN(16) 
int16x8_t neon_i16; + SIMDE_ALIGN(16) int32x4_t neon_i32; + SIMDE_ALIGN(16) int64x2_t neon_i64; + SIMDE_ALIGN(16) uint8x16_t neon_u8; + SIMDE_ALIGN(16) uint16x8_t neon_u16; + SIMDE_ALIGN(16) uint32x4_t neon_u32; + SIMDE_ALIGN(16) uint64x2_t neon_u64; + SIMDE_ALIGN(16) float32x4_t neon_f32; +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + SIMDE_ALIGN(16) float64x2_t neon_f64; +#endif +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + SIMDE_ALIGN(16) v128_t wasm_v128; +#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) + SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) altivec_u8; + SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) altivec_u16; + SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) altivec_u32; + SIMDE_ALIGN(16) + SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) altivec_u64; + SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(signed char) altivec_i8; + SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(signed short) altivec_i16; + SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(signed int) altivec_i32; + SIMDE_ALIGN(16) + SIMDE_POWER_ALTIVEC_VECTOR(signed long long) altivec_i64; + SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(float) altivec_f32; +#if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) + SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(double) altivec_f64; +#endif +#endif +} simde__m128_private; + +#if defined(SIMDE_X86_SSE_NATIVE) +typedef __m128 simde__m128; +#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) +typedef float32x4_t simde__m128; +#elif defined(SIMDE_WASM_SIMD128_NATIVE) +typedef v128_t simde__m128; +#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) +typedef SIMDE_POWER_ALTIVEC_VECTOR(float) simde__m128; +#elif defined(SIMDE_VECTOR_SUBSCRIPT) +typedef simde_float32 simde__m128 SIMDE_ALIGN(16) + SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; +#else +typedef simde__m128_private simde__m128; +#endif + +#if !defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_ENABLE_NATIVE_ALIASES) +#define SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES +typedef simde__m128 __m128; #endif -#if defined(SIMDE_SSE_FORCE_NATIVE) -#define SIMDE_SSE_NATIVE -#elif defined(__SSE__) && !defined(SIMDE_SSE_NO_NATIVE) && \ - !defined(SIMDE_NO_NATIVE) -#define SIMDE_SSE_NATIVE -#elif defined(__ARM_NEON) && !defined(SIMDE_SSE_NO_NEON) && \ - !defined(SIMDE_NO_NEON) -#define SIMDE_SSE_NEON + +HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128), "simde__m128 size incorrect"); +HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128_private), + "simde__m128_private size incorrect"); +#if defined(SIMDE_CHECK_ALIGNMENT) && defined(SIMDE_ALIGN_OF) +HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128) == 16, + "simde__m128 is not 16-byte aligned"); +HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128_private) == 16, + "simde__m128_private is not 16-byte aligned"); #endif -#if defined(SIMDE_SSE_NATIVE) && !defined(SIMDE_MMX_NATIVE) -#if defined(SIMDE_SSE_FORCE_NATIVE) -#error Native SSE support requires native MMX support +SIMDE_FUNCTION_ATTRIBUTES +simde__m128 simde__m128_from_private(simde__m128_private v) +{ + simde__m128 r; + simde_memcpy(&r, &v, sizeof(r)); + return r; +} + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128_private simde__m128_to_private(simde__m128 v) +{ + simde__m128_private r; + simde_memcpy(&r, &v, sizeof(r)); + return r; +} + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) +SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, int8x16_t, neon, i8) +SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, int16x8_t, neon, i16) +SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, int32x4_t, neon, i32) +SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, int64x2_t, neon, i64) 
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, uint8x16_t, neon, u8) +SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, uint16x8_t, neon, u16) +SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, uint32x4_t, neon, u32) +SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, uint64x2_t, neon, u64) +SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, float32x4_t, neon, f32) +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) +SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, float64x2_t, neon, f64) +#endif +#endif /* defined(SIMDE_ARM_NEON_A32V7_NATIVE) */ + +#if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) +HEDLEY_DIAGNOSTIC_POP +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128 simde_mm_set_ps(simde_float32 e3, simde_float32 e2, + simde_float32 e1, simde_float32 e0) +{ +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_set_ps(e3, e2, e1, e0); +#else + simde__m128_private r_; + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + SIMDE_ALIGN(16) simde_float32 data[4] = {e0, e1, e2, e3}; + r_.neon_f32 = vld1q_f32(data); +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f32x4_make(e0, e1, e2, e3); #else -#warning Native SSE support requires native MMX support, disabling -#undef SIMDE_SSE_NATIVE + r_.f32[0] = e0; + r_.f32[1] = e1; + r_.f32[2] = e2; + r_.f32[3] = e3; #endif -#elif defined(SIMDE_SSE_NEON) && !defined(SIMDE_MMX_NEON) -#warning SSE3 NEON support requires MMX NEON support, disabling -#undef SIMDE_SSE3_NEON + + return simde__m128_from_private(r_); +#endif +} +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_set_ps(e3, e2, e1, e0) simde_mm_set_ps(e3, e2, e1, e0) #endif -#if defined(SIMDE_SSE_NATIVE) -#include +SIMDE_FUNCTION_ATTRIBUTES +simde__m128 simde_mm_set_ps1(simde_float32 a) +{ +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_set_ps1(a); +#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vdupq_n_f32(a); +#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) + (void)a; + return vec_splats(a); #else -#if defined(SIMDE_SSE_NEON) -#include + return simde_mm_set_ps(a, a, a, a); +#endif +} +#define simde_mm_set1_ps(a) simde_mm_set_ps1(a) +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_set_ps1(a) simde_mm_set_ps1(a) +#define _mm_set1_ps(a) simde_mm_set1_ps(a) #endif -#if !defined(__INTEL_COMPILER) && defined(__STDC_VERSION__) && \ - (__STDC_VERSION__ >= 201112L) && !defined(__STDC_NO_ATOMICS__) -#include -#elif defined(_WIN32) -#include -#endif -#endif - -#include -#include - -#define SIMDE_ALIGN(alignment) __attribute__((aligned(alignment))) -SIMDE__BEGIN_DECLS - -typedef SIMDE_ALIGN(16) union { -#if defined(SIMDE__ENABLE_GCC_VEC_EXT) - int8_t i8 __attribute__((__vector_size__(16), __may_alias__)); - int16_t i16 __attribute__((__vector_size__(16), __may_alias__)); - int32_t i32 __attribute__((__vector_size__(16), __may_alias__)); - int64_t i64 __attribute__((__vector_size__(16), __may_alias__)); - uint8_t u8 __attribute__((__vector_size__(16), __may_alias__)); - uint16_t u16 __attribute__((__vector_size__(16), __may_alias__)); - uint32_t u32 __attribute__((__vector_size__(16), __may_alias__)); - uint64_t u64 __attribute__((__vector_size__(16), __may_alias__)); -#if defined(SIMDE__HAVE_INT128) - simde_int128 i128 __attribute__((__vector_size__(16), __may_alias__)); - simde_uint128 u128 __attribute__((__vector_size__(16), __may_alias__)); -#endif - simde_float32 f32 __attribute__((__vector_size__(16), __may_alias__)); -#else - int8_t i8[16]; - int16_t i16[8]; - int32_t i32[4]; - int64_t i64[2]; - uint8_t u8[16]; - uint16_t u16[8]; - uint32_t u32[4]; - uint64_t u64[2]; -#if defined(SIMDE__HAVE_INT128) - 
simde_int128 i128[1]; - simde_uint128 u128[1]; -#endif - simde_float32 f32[4]; -#endif - -#if defined(SIMDE_SSE_NATIVE) - __m128 n; -#elif defined(SIMDE_SSE_NEON) - int8x16_t neon_i8; - int16x8_t neon_i16; - int32x4_t neon_i32; - int64x2_t neon_i64; - uint8x16_t neon_u8; - uint16x8_t neon_u16; - uint32x4_t neon_u32; - uint64x2_t neon_u64; - float32x4_t neon_f32; -#endif -} simde__m128; - -#if defined(SIMDE_SSE_NATIVE) -HEDLEY_STATIC_ASSERT(sizeof(__m128) == sizeof(simde__m128), - "__m128 size doesn't match simde__m128 size"); -SIMDE__FUNCTION_ATTRIBUTES simde__m128 SIMDE__M128_C(__m128 v) +SIMDE_FUNCTION_ATTRIBUTES +simde__m128 simde_mm_move_ss(simde__m128 a, simde__m128 b) { - simde__m128 r; - r.n = v; - return r; +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_move_ss(a, b); +#else + simde__m128_private r_, a_ = simde__m128_to_private(a), + b_ = simde__m128_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_f32 = + vsetq_lane_f32(vgetq_lane_f32(b_.neon_f32, 0), a_.neon_f32, 0); +#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) + SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) + m = {16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + r_.altivec_f32 = vec_perm(a_.altivec_f32, b_.altivec_f32, m); +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_v8x16_shuffle(b_.wasm_v128, a_.wasm_v128, 0, 1, 2, + 3, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 31); +#elif defined(SIMDE_SHUFFLE_VECTOR_) + r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 4, 1, 2, 3); +#else + r_.f32[0] = b_.f32[0]; + r_.f32[1] = a_.f32[1]; + r_.f32[2] = a_.f32[2]; + r_.f32[3] = a_.f32[3]; +#endif + + return simde__m128_from_private(r_); +#endif } -#elif defined(SIMDE_SSE_NEON) -#define SIMDE__M128_NEON_C(T, expr) \ - (simde__m128) { .neon_##T = expr } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_move_ss(a, b) simde_mm_move_ss((a), (b)) #endif -HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128), "simde__m128 size incorrect"); -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_add_ps(simde__m128 a, simde__m128 b) { - simde__m128 r; +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_add_ps(a, b); +#else + simde__m128_private r_, a_ = simde__m128_to_private(a), + b_ = simde__m128_to_private(b); -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_add_ps(a.n, b.n); -#elif defined(SIMDE_SSE_NEON) - r.neon_f32 = vaddq_f32(a.neon_f32, b.neon_f32); +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_f32 = vaddq_f32(a_.neon_f32, b_.neon_f32); +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f32x4_add(a_.wasm_v128, b_.wasm_v128); +#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) + r_.altivec_f32 = vec_add(a_.altivec_f32, b_.altivec_f32); +#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.f32 = a_.f32 + b_.f32; #else - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) { - r.f32[i] = a.f32[i] + b.f32[i]; + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) { + r_.f32[i] = a_.f32[i] + b_.f32[i]; } #endif - return r; + return simde__m128_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_add_ps(a, b) simde_mm_add_ps((a), (b)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_add_ss(simde__m128 a, simde__m128 b) { - simde__m128 r; - -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_add_ss(a.n, b.n); -#elif defined(SIMDE_SSE_NEON) - float32_t b0 = vgetq_lane_f32(b.neon_f32, 0); - float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0); - 
/* the upper values in the result must be the remnants of . */ - r.neon_f32 = vaddq_f32(a.neon_f32, value); -#elif defined(SIMDE__SHUFFLE_VECTOR) && defined(SIMDE_ASSUME_VECTORIZATION) - r.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a.f32, simde_mm_add_ps(a, b).f32, - 4, 1, 2, 3); +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_add_ss(a, b); +#elif defined(SIMDE_ASSUME_VECTORIZATION) + return simde_mm_move_ss(a, simde_mm_add_ps(a, b)); #else - r.f32[0] = a.f32[0] + b.f32[0]; - r.f32[1] = a.f32[1]; - r.f32[2] = a.f32[2]; - r.f32[3] = a.f32[3]; -#endif + simde__m128_private r_, a_ = simde__m128_to_private(a), + b_ = simde__m128_to_private(b); - return r; + r_.f32[0] = a_.f32[0] + b_.f32[0]; + r_.f32[1] = a_.f32[1]; + r_.f32[2] = a_.f32[2]; + r_.f32[3] = a_.f32[3]; + + return simde__m128_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_add_ss(a, b) simde_mm_add_ss((a), (b)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_and_ps(simde__m128 a, simde__m128 b) { - simde__m128 r; +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_and_ps(a, b); +#else + simde__m128_private r_, a_ = simde__m128_to_private(a), + b_ = simde__m128_to_private(b); -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_and_ps(a.n, b.n); -#elif defined(SIMDE_SSE_NEON) - r.neon_i32 = vandq_s32(a.neon_i32, b.neon_i32); +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i32 = vandq_s32(a_.neon_i32, b_.neon_i32); +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_v128_and(a_.wasm_v128, b_.wasm_v128); +#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i32 = a_.i32 & b_.i32; +#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) + r_.altivec_f32 = vec_and(a_.altivec_f32, b_.altivec_f32); #else - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) { - r.i32[i] = a.i32[i] & b.i32[i]; + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) { + r_.i32[i] = a_.i32[i] & b_.i32[i]; } #endif - return r; + return simde__m128_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_and_ps(a, b) simde_mm_and_ps((a), (b)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_andnot_ps(simde__m128 a, simde__m128 b) { - simde__m128 r; +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_andnot_ps(a, b); +#else + simde__m128_private r_, a_ = simde__m128_to_private(a), + b_ = simde__m128_to_private(b); -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_andnot_ps(a.n, b.n); -#elif defined(SIMDE_SSE_NEON) - r.neon_i32 = vbicq_s32(b.neon_i32, a.neon_i32); +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i32 = vbicq_s32(b_.neon_i32, a_.neon_i32); +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_v128_andnot(b_.wasm_v128, a_.wasm_v128); +#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) + r_.altivec_f32 = vec_andc(b_.altivec_f32, a_.altivec_f32); +#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i32 = ~a_.i32 & b_.i32; #else - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) { - r.i32[i] = ~(a.i32[i]) & b.i32[i]; + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) { + r_.i32[i] = ~(a_.i32[i]) & b_.i32[i]; } #endif - return r; + return simde__m128_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_andnot_ps(a, b) simde_mm_andnot_ps((a), (b)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_avg_pu16(simde__m64 a, simde__m64 b) { - 
simde__m64 r; - -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_avg_pu16(a.n, b.n); -#elif defined(SIMDE_SSE_NEON) - r.neon_u16 = vrhadd_u16(b.neon_u16, a.neon_u16); -#else - SIMDE__VECTORIZE - for (size_t i = 0; i < 4; i++) { - r.u16[i] = (a.u16[i] + b.u16[i] + 1) >> 1; +#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) + return _mm_avg_pu16(a, b); +#else + simde__m64_private r_, a_ = simde__m64_to_private(a), + b_ = simde__m64_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_u16 = vrhadd_u16(b_.neon_u16, a_.neon_u16); +#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && \ + defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && \ + defined(SIMDE_CONVERT_VECTOR_) + uint32_t wa SIMDE_VECTOR(16); + uint32_t wb SIMDE_VECTOR(16); + uint32_t wr SIMDE_VECTOR(16); + SIMDE_CONVERT_VECTOR_(wa, a_.u16); + SIMDE_CONVERT_VECTOR_(wb, b_.u16); + wr = (wa + wb + 1) >> 1; + SIMDE_CONVERT_VECTOR_(r_.u16, wr); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.u16) / sizeof(r_.u16[0])); i++) { + r_.u16[i] = (a_.u16[i] + b_.u16[i] + 1) >> 1; } #endif - return r; + return simde__m64_from_private(r_); +#endif } #define simde_m_pavgw(a, b) simde_mm_avg_pu16(a, b) +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_avg_pu16(a, b) simde_mm_avg_pu16(a, b) +#define _m_pavgw(a, b) simde_mm_avg_pu16(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_avg_pu8(simde__m64 a, simde__m64 b) { - simde__m64 r; - -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_avg_pu8(a.n, b.n); -#elif defined(SIMDE_SSE_NEON) - r.neon_u8 = vrhadd_u8(b.neon_u8, a.neon_u8); -#else - SIMDE__VECTORIZE - for (size_t i = 0; i < 8; i++) { - r.u8[i] = (a.u8[i] + b.u8[i] + 1) >> 1; +#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) + return _mm_avg_pu8(a, b); +#else + simde__m64_private r_, a_ = simde__m64_to_private(a), + b_ = simde__m64_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_u8 = vrhadd_u8(b_.neon_u8, a_.neon_u8); +#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && \ + defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && \ + defined(SIMDE_CONVERT_VECTOR_) + uint16_t wa SIMDE_VECTOR(16); + uint16_t wb SIMDE_VECTOR(16); + uint16_t wr SIMDE_VECTOR(16); + SIMDE_CONVERT_VECTOR_(wa, a_.u8); + SIMDE_CONVERT_VECTOR_(wb, b_.u8); + wr = (wa + wb + 1) >> 1; + SIMDE_CONVERT_VECTOR_(r_.u8, wr); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.u8) / sizeof(r_.u8[0])); i++) { + r_.u8[i] = (a_.u8[i] + b_.u8[i] + 1) >> 1; } #endif - return r; + return simde__m64_from_private(r_); +#endif } #define simde_m_pavgb(a, b) simde_mm_avg_pu8(a, b) +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_avg_pu8(a, b) simde_mm_avg_pu8(a, b) +#define _m_pavgb(a, b) simde_mm_avg_pu8(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_cmpeq_ps(simde__m128 a, simde__m128 b) { - simde__m128 r; - -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_cmpeq_ps(a.n, b.n); -#elif defined(SIMDE_SSE_NEON) - r.neon_u32 = vceqq_f32(a.neon_f32, b.neon_f32); -#else - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) { - r.u32[i] = (a.f32[i] == b.f32[i]) ? 
0xffffffff : 0; +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_cmpeq_ps(a, b); +#else + simde__m128_private r_, a_ = simde__m128_to_private(a), + b_ = simde__m128_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_u32 = vceqq_f32(a_.neon_f32, b_.neon_f32); +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f32x4_eq(a_.wasm_v128, b_.wasm_v128); +#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) + r_.altivec_f32 = (SIMDE_POWER_ALTIVEC_VECTOR(float))vec_cmpeq( + a_.altivec_f32, b_.altivec_f32); +#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), a_.f32 == b_.f32); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) { + r_.u32[i] = (a_.f32[i] == b_.f32[i]) ? ~UINT32_C(0) + : UINT32_C(0); } #endif - return r; + return simde__m128_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_cmpeq_ps(a, b) simde_mm_cmpeq_ps((a), (b)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_cmpeq_ss(simde__m128 a, simde__m128 b) { - simde__m128 r; +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_cmpeq_ss(a, b); +#elif defined(SIMDE_ASSUME_VECTORIZATION) + return simde_mm_move_ss(a, simde_mm_cmpeq_ps(a, b)); +#else + simde__m128_private r_, a_ = simde__m128_to_private(a), + b_ = simde__m128_to_private(b); -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_cmpeq_ss(a.n, b.n); -#elif defined(SIMDE_SSE_NEON) - float32x4_t s = - vreinterpretq_f32_u32(vceqq_f32(a.neon_f32, b.neon_f32)); - float32x4_t t = vextq_f32(a.neon_f32, s, 1); - r.neon_f32 = vextq_f32(t, t, 3); -#elif defined(SIMDE__SHUFFLE_VECTOR) && defined(SIMDE_ASSUME_VECTORIZATION) - r.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a.f32, - simde_mm_cmpeq_ps(a, b).f32, 4, 1, 2, 3); -#else - r.u32[0] = (a.f32[0] == b.f32[0]) ? 0xffffffff : 0; - SIMDE__VECTORIZE - for (size_t i = 1; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) { - r.u32[i] = a.u32[i]; + r_.u32[0] = (a_.f32[0] == b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0); + SIMDE_VECTORIZE + for (size_t i = 1; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) { + r_.u32[i] = a_.u32[i]; } -#endif - return r; + return simde__m128_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_cmpeq_ss(a, b) simde_mm_cmpeq_ss((a), (b)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_cmpge_ps(simde__m128 a, simde__m128 b) { - simde__m128 r; - -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_cmpge_ps(a.n, b.n); -#elif defined(SIMDE_SSE_NEON) - r.neon_u32 = vcgeq_f32(a.neon_f32, b.neon_f32); -#else - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) { - r.u32[i] = (a.f32[i] >= b.f32[i]) ? 0xffffffff : 0; +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_cmpge_ps(a, b); +#else + simde__m128_private r_, a_ = simde__m128_to_private(a), + b_ = simde__m128_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_u32 = vcgeq_f32(a_.neon_f32, b_.neon_f32); +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f32x4_ge(a_.wasm_v128, b_.wasm_v128); +#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) + r_.altivec_f32 = (SIMDE_POWER_ALTIVEC_VECTOR(float))vec_cmpge( + a_.altivec_f32, b_.altivec_f32); +#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.f32 >= b_.f32)); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) { + r_.u32[i] = (a_.f32[i] >= b_.f32[i]) ? 
~UINT32_C(0) + : UINT32_C(0); } #endif - return r; + return simde__m128_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_cmpge_ps(a, b) simde_mm_cmpge_ps((a), (b)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_cmpge_ss(simde__m128 a, simde__m128 b) { - simde__m128 r; +#if defined(SIMDE_X86_SSE_NATIVE) && !defined(__PGI) + return _mm_cmpge_ss(a, b); +#elif defined(SIMDE_ASSUME_VECTORIZATION) + return simde_mm_move_ss(a, simde_mm_cmpge_ps(a, b)); +#else + simde__m128_private r_, a_ = simde__m128_to_private(a), + b_ = simde__m128_to_private(b); -#if defined(SIMDE_SSE_NATIVE) && !defined(__PGI) - r.n = _mm_cmpge_ss(a.n, b.n); -#elif defined(SIMDE_SSE_NEON) - float32x4_t s = - vreinterpretq_f32_u32(vcgeq_f32(a.neon_f32, b.neon_f32)); - float32x4_t t = vextq_f32(a.neon_f32, s, 1); - r.neon_f32 = vextq_f32(t, t, 3); -#elif defined(SIMDE__SHUFFLE_VECTOR) && defined(SIMDE_ASSUME_VECTORIZATION) - r.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a.f32, - simde_mm_cmpge_ps(a, b).f32, 4, 1, 2, 3); -#else - r.u32[0] = (a.f32[0] >= b.f32[0]) ? 0xffffffff : 0; - SIMDE__VECTORIZE - for (size_t i = 1; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) { - r.u32[i] = a.u32[i]; + r_.u32[0] = (a_.f32[0] >= b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0); + SIMDE_VECTORIZE + for (size_t i = 1; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) { + r_.u32[i] = a_.u32[i]; } -#endif - return r; + return simde__m128_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_cmpge_ss(a, b) simde_mm_cmpge_ss((a), (b)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_cmpgt_ps(simde__m128 a, simde__m128 b) { - simde__m128 r; - -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_cmpgt_ps(a.n, b.n); -#elif defined(SIMDE_SSE_NEON) - r.neon_u32 = vcgtq_f32(a.neon_f32, b.neon_f32); -#else - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) { - r.u32[i] = (a.f32[i] > b.f32[i]) ? 0xffffffff : 0; +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_cmpgt_ps(a, b); +#else + simde__m128_private r_, a_ = simde__m128_to_private(a), + b_ = simde__m128_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_u32 = vcgtq_f32(a_.neon_f32, b_.neon_f32); +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f32x4_gt(a_.wasm_v128, b_.wasm_v128); +#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) + r_.altivec_f32 = (SIMDE_POWER_ALTIVEC_VECTOR(float))vec_cmpgt( + a_.altivec_f32, b_.altivec_f32); +#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.f32 > b_.f32)); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) { + r_.u32[i] = (a_.f32[i] > b_.f32[i]) ? 
~UINT32_C(0) + : UINT32_C(0); } #endif - return r; + return simde__m128_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_cmpgt_ps(a, b) simde_mm_cmpgt_ps((a), (b)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_cmpgt_ss(simde__m128 a, simde__m128 b) { - simde__m128 r; +#if defined(SIMDE_X86_SSE_NATIVE) && !defined(__PGI) + return _mm_cmpgt_ss(a, b); +#elif defined(SIMDE_ASSUME_VECTORIZATION) + return simde_mm_move_ss(a, simde_mm_cmpgt_ps(a, b)); +#else + simde__m128_private r_, a_ = simde__m128_to_private(a), + b_ = simde__m128_to_private(b); -#if defined(SIMDE_SSE_NATIVE) && !defined(__PGI) - r.n = _mm_cmpgt_ss(a.n, b.n); -#elif defined(SIMDE_SSE_NEON) - float32x4_t s = - vreinterpretq_f32_u32(vcgtq_f32(a.neon_f32, b.neon_f32)); - float32x4_t t = vextq_f32(a.neon_f32, s, 1); - r.neon_f32 = vextq_f32(t, t, 3); -#elif defined(SIMDE__SHUFFLE_VECTOR) && defined(SIMDE_ASSUME_VECTORIZATION) - r.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a.f32, - simde_mm_cmpgt_ps(a, b).f32, 4, 1, 2, 3); -#else - r.u32[0] = (a.f32[0] > b.f32[0]) ? 0xffffffff : 0; - SIMDE__VECTORIZE - for (size_t i = 1; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) { - r.u32[i] = a.u32[i]; + r_.u32[0] = (a_.f32[0] > b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0); + SIMDE_VECTORIZE + for (size_t i = 1; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) { + r_.u32[i] = a_.u32[i]; } -#endif - return r; + return simde__m128_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_cmpgt_ss(a, b) simde_mm_cmpgt_ss((a), (b)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_cmple_ps(simde__m128 a, simde__m128 b) { - simde__m128 r; - -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_cmple_ps(a.n, b.n); -#elif defined(SIMDE_SSE_NEON) - r.neon_u32 = vcleq_f32(a.neon_f32, b.neon_f32); -#else - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) { - r.u32[i] = (a.f32[i] <= b.f32[i]) ? 0xffffffff : 0; +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_cmple_ps(a, b); +#else + simde__m128_private r_, a_ = simde__m128_to_private(a), + b_ = simde__m128_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_u32 = vcleq_f32(a_.neon_f32, b_.neon_f32); +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f32x4_le(a_.wasm_v128, b_.wasm_v128); +#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) + r_.altivec_f32 = (SIMDE_POWER_ALTIVEC_VECTOR(float))vec_cmple( + a_.altivec_f32, b_.altivec_f32); +#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.f32 <= b_.f32)); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) { + r_.u32[i] = (a_.f32[i] <= b_.f32[i]) ? 
~UINT32_C(0) + : UINT32_C(0); } #endif - return r; + return simde__m128_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_cmple_ps(a, b) simde_mm_cmple_ps((a), (b)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_cmple_ss(simde__m128 a, simde__m128 b) { - simde__m128 r; +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_cmple_ss(a, b); +#elif defined(SIMDE_ASSUME_VECTORIZATION) + return simde_mm_move_ss(a, simde_mm_cmple_ps(a, b)); +#else + simde__m128_private r_, a_ = simde__m128_to_private(a), + b_ = simde__m128_to_private(b); -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_cmple_ss(a.n, b.n); -#elif defined(SIMDE_SSE_NEON) - float32x4_t s = - vreinterpretq_f32_u32(vcleq_f32(a.neon_f32, b.neon_f32)); - float32x4_t t = vextq_f32(a.neon_f32, s, 1); - r.neon_f32 = vextq_f32(t, t, 3); -#elif defined(SIMDE__SHUFFLE_VECTOR) && defined(SIMDE_ASSUME_VECTORIZATION) - r.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a.f32, - simde_mm_cmple_ps(a, b).f32, 4, 1, 2, 3); -#else - r.u32[0] = (a.f32[0] <= b.f32[0]) ? 0xffffffff : 0; - SIMDE__VECTORIZE - for (size_t i = 1; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) { - r.u32[i] = a.u32[i]; + r_.u32[0] = (a_.f32[0] <= b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0); + SIMDE_VECTORIZE + for (size_t i = 1; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) { + r_.u32[i] = a_.u32[i]; } -#endif - return r; + return simde__m128_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_cmple_ss(a, b) simde_mm_cmple_ss((a), (b)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_cmplt_ps(simde__m128 a, simde__m128 b) { - simde__m128 r; - -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_cmplt_ps(a.n, b.n); -#elif defined(SIMDE_SSE_NEON) - r.neon_u32 = vcltq_f32(a.neon_f32, b.neon_f32); -#else - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) { - r.u32[i] = (a.f32[i] < b.f32[i]) ? 0xffffffff : 0; +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_cmplt_ps(a, b); +#else + simde__m128_private r_, a_ = simde__m128_to_private(a), + b_ = simde__m128_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_u32 = vcltq_f32(a_.neon_f32, b_.neon_f32); +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f32x4_lt(a_.wasm_v128, b_.wasm_v128); +#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) + r_.altivec_f32 = (SIMDE_POWER_ALTIVEC_VECTOR(float))vec_cmplt( + a_.altivec_f32, b_.altivec_f32); +#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.f32 < b_.f32)); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) { + r_.u32[i] = (a_.f32[i] < b_.f32[i]) ? 
~UINT32_C(0) + : UINT32_C(0); } #endif - return r; + return simde__m128_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_cmplt_ps(a, b) simde_mm_cmplt_ps((a), (b)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_cmplt_ss(simde__m128 a, simde__m128 b) { - simde__m128 r; +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_cmplt_ss(a, b); +#elif defined(SIMDE_ASSUME_VECTORIZATION) + return simde_mm_move_ss(a, simde_mm_cmplt_ps(a, b)); +#else + simde__m128_private r_, a_ = simde__m128_to_private(a), + b_ = simde__m128_to_private(b); -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_cmplt_ss(a.n, b.n); -#elif defined(SIMDE_SSE_NEON) - float32x4_t s = - vreinterpretq_f32_u32(vcltq_f32(a.neon_f32, b.neon_f32)); - float32x4_t t = vextq_f32(a.neon_f32, s, 1); - r.neon_f32 = vextq_f32(t, t, 3); -#elif defined(SIMDE__SHUFFLE_VECTOR) && defined(SIMDE_ASSUME_VECTORIZATION) - r.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a.f32, - simde_mm_cmplt_ps(a, b).f32, 4, 1, 2, 3); -#else - r.u32[0] = (a.f32[0] < b.f32[0]) ? 0xffffffff : 0; - SIMDE__VECTORIZE - for (size_t i = 1; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) { - r.u32[i] = a.u32[i]; + r_.u32[0] = (a_.f32[0] < b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0); + SIMDE_VECTORIZE + for (size_t i = 1; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) { + r_.u32[i] = a_.u32[i]; } -#endif - return r; + return simde__m128_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_cmplt_ss(a, b) simde_mm_cmplt_ss((a), (b)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_cmpneq_ps(simde__m128 a, simde__m128 b) { - simde__m128 r; - -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_cmpneq_ps(a.n, b.n); -#elif defined(SIMDE_SSE_NEON) - r.neon_u32 = vmvnq_u32(vceqq_f32(a.neon_f32, b.neon_f32)); -#else - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) { - r.u32[i] = (a.f32[i] != b.f32[i]) ? 0xffffffff : 0; +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_cmpneq_ps(a, b); +#else + simde__m128_private r_, a_ = simde__m128_to_private(a), + b_ = simde__m128_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_u32 = vmvnq_u32(vceqq_f32(a_.neon_f32, b_.neon_f32)); +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f32x4_ne(a_.wasm_v128, b_.wasm_v128); +#elif defined(SIMDE_POWER_ALTIVEC_P9_NATIVE) && SIMDE_ARCH_POWER_CHECK(900) && \ + !defined(HEDLEY_IBM_VERSION) + /* vec_cmpne(vector float, vector float) is missing from XL C/C++ v16.1.1, + though the documentation (table 89 on page 432 of the IBM XL C/C++ for + Linux Compiler Reference, Version 16.1.1) shows that it should be + present. Both GCC and clang support it. */ + r_.altivec_f32 = (SIMDE_POWER_ALTIVEC_VECTOR(float))vec_cmpne( + a_.altivec_f32, b_.altivec_f32); +#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.f32 != b_.f32)); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) { + r_.u32[i] = (a_.f32[i] != b_.f32[i]) ? 
~UINT32_C(0) + : UINT32_C(0); } #endif - return r; + return simde__m128_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_cmpneq_ps(a, b) simde_mm_cmpneq_ps((a), (b)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_cmpneq_ss(simde__m128 a, simde__m128 b) { - simde__m128 r; +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_cmpneq_ss(a, b); +#elif defined(SIMDE_ASSUME_VECTORIZATION) + return simde_mm_move_ss(a, simde_mm_cmpneq_ps(a, b)); +#else + simde__m128_private r_, a_ = simde__m128_to_private(a), + b_ = simde__m128_to_private(b); -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_cmpneq_ss(a.n, b.n); -#elif defined(SIMDE_SSE_NEON) - float32x4_t e = - vreinterpretq_f32_u32(vceqq_f32(a.neon_f32, b.neon_f32)); - float32x4_t s = - vreinterpretq_f32_u32(vmvnq_u32(vreinterpretq_u32_f32(e))); - float32x4_t t = vextq_f32(a.neon_f32, s, 1); - r.neon_f32 = vextq_f32(t, t, 3); -#elif defined(SIMDE__SHUFFLE_VECTOR) && defined(SIMDE_ASSUME_VECTORIZATION) - r.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a.f32, - simde_mm_cmpneq_ps(a, b).f32, 4, 1, 2, 3); -#else - r.u32[0] = (a.f32[0] != b.f32[0]) ? 0xffffffff : 0; - SIMDE__VECTORIZE - for (size_t i = 1; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) { - r.u32[i] = a.u32[i]; + r_.u32[0] = (a_.f32[0] != b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0); + SIMDE_VECTORIZE + for (size_t i = 1; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) { + r_.u32[i] = a_.u32[i]; } -#endif - return r; + return simde__m128_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_cmpneq_ss(a, b) simde_mm_cmpneq_ss((a), (b)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_cmpnge_ps(simde__m128 a, simde__m128 b) { - simde__m128 r; - -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_cmpnge_ps(a.n, b.n); -#elif defined(SIMDE_SSE_NEON) - r.neon_u32 = vcltq_f32(a.neon_f32, b.neon_f32); -#else - r = simde_mm_cmplt_ps(a, b); -#endif - - return r; + return simde_mm_cmplt_ps(a, b); } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_cmpnge_ps(a, b) simde_mm_cmpnge_ps((a), (b)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_cmpnge_ss(simde__m128 a, simde__m128 b) { - simde__m128 r; - -#if defined(SIMDE_SSE_NATIVE) && !defined(__PGI) - r.n = _mm_cmpnge_ss(a.n, b.n); -#elif defined(SIMDE_SSE_NEON) - float32x4_t s = - vreinterpretq_f32_u32(vcltq_f32(a.neon_f32, b.neon_f32)); - float32x4_t t = vextq_f32(a.neon_f32, s, 1); - r.neon_f32 = vextq_f32(t, t, 3); -#else - r = simde_mm_cmplt_ss(a, b); -#endif - - return r; + return simde_mm_cmplt_ss(a, b); } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_cmpnge_ss(a, b) simde_mm_cmpnge_ss((a), (b)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_cmpngt_ps(simde__m128 a, simde__m128 b) { - simde__m128 r; - -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_cmpngt_ps(a.n, b.n); -#elif defined(SIMDE_SSE_NEON) - r.neon_u32 = vcleq_f32(a.neon_f32, b.neon_f32); -#else - r = simde_mm_cmple_ps(a, b); -#endif - - return r; + return simde_mm_cmple_ps(a, b); } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_cmpngt_ps(a, b) simde_mm_cmpngt_ps((a), (b)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_cmpngt_ss(simde__m128 a, simde__m128 b) { - simde__m128 r; - -#if defined(SIMDE_SSE_NATIVE) && !defined(__PGI) - r.n = _mm_cmpngt_ss(a.n, b.n); -#elif defined(SIMDE_SSE_NEON) - float32x4_t s = - 
vreinterpretq_f32_u32(vcleq_f32(a.neon_f32, b.neon_f32)); - float32x4_t t = vextq_f32(a.neon_f32, s, 1); - r.neon_f32 = vextq_f32(t, t, 3); -#else - r = simde_mm_cmple_ss(a, b); -#endif - - return r; + return simde_mm_cmple_ss(a, b); } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_cmpngt_ss(a, b) simde_mm_cmpngt_ss((a), (b)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_cmpnle_ps(simde__m128 a, simde__m128 b) { - simde__m128 r; - -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_cmpnle_ps(a.n, b.n); -#elif defined(SIMDE_SSE_NEON) - r.neon_u32 = vcgtq_f32(a.neon_f32, b.neon_f32); -#else - r = simde_mm_cmpgt_ps(a, b); -#endif - - return r; + return simde_mm_cmpgt_ps(a, b); } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_cmpnle_ps(a, b) simde_mm_cmpnle_ps((a), (b)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_cmpnle_ss(simde__m128 a, simde__m128 b) { - simde__m128 r; - -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_cmpnle_ss(a.n, b.n); -#elif defined(SIMDE_SSE_NEON) - float32x4_t s = - vreinterpretq_f32_u32(vcgtq_f32(a.neon_f32, b.neon_f32)); - float32x4_t t = vextq_f32(a.neon_f32, s, 1); - r.neon_f32 = vextq_f32(t, t, 3); -#else - r = simde_mm_cmpgt_ss(a, b); -#endif - - return r; + return simde_mm_cmpgt_ss(a, b); } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_cmpnle_ss(a, b) simde_mm_cmpnle_ss((a), (b)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_cmpnlt_ps(simde__m128 a, simde__m128 b) { - simde__m128 r; - -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_cmpnlt_ps(a.n, b.n); -#elif defined(SIMDE_SSE_NEON) - r.neon_u32 = vcgeq_f32(a.neon_f32, b.neon_f32); -#else - r = simde_mm_cmpge_ps(a, b); -#endif - - return r; + return simde_mm_cmpge_ps(a, b); } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_cmpnlt_ps(a, b) simde_mm_cmpnlt_ps((a), (b)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_cmpnlt_ss(simde__m128 a, simde__m128 b) { - simde__m128 r; - -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_cmpnlt_ss(a.n, b.n); -#else - r = simde_mm_cmpge_ss(a, b); -#endif - - return r; + return simde_mm_cmpge_ss(a, b); } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_cmpnlt_ss(a, b) simde_mm_cmpnlt_ss((a), (b)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_cmpord_ps(simde__m128 a, simde__m128 b) { - simde__m128 r; +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_cmpord_ps(a, b); +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + return wasm_v128_and(wasm_f32x4_eq(a, a), wasm_f32x4_eq(b, b)); +#else + simde__m128_private r_, a_ = simde__m128_to_private(a), + b_ = simde__m128_to_private(b); -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_cmpord_ps(a.n, b.n); -#elif defined(SIMDE_SSE_NEON) +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) /* Note: NEON does not have ordered compare builtin Need to compare a eq a and b eq b to check for NaN Do AND of results to get final */ - uint32x4_t ceqaa = vceqq_f32(a.neon_f32, a.neon_f32); - uint32x4_t ceqbb = vceqq_f32(b.neon_f32, b.neon_f32); - r.neon_u32 = vandq_u32(ceqaa, ceqbb); -#else - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) { - r.u32[i] = (isnan(a.f32[i]) || isnan(b.f32[i])) ? 
0 - : 0xffffffff; + uint32x4_t ceqaa = vceqq_f32(a_.neon_f32, a_.neon_f32); + uint32x4_t ceqbb = vceqq_f32(b_.neon_f32, b_.neon_f32); + r_.neon_u32 = vandq_u32(ceqaa, ceqbb); +#elif defined(simde_math_isnanf) + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) { + r_.u32[i] = (simde_math_isnanf(a_.f32[i]) || + simde_math_isnanf(b_.f32[i])) + ? UINT32_C(0) + : ~UINT32_C(0); } +#else + HEDLEY_UNREACHABLE(); #endif - return r; -} - -SIMDE__FUNCTION_ATTRIBUTES -simde__m128 simde_mm_cmpord_ss(simde__m128 a, simde__m128 b) -{ - simde__m128 r; - -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_cmpord_ss(a.n, b.n); -#elif defined(SIMDE_SSE_NEON) - uint32x4_t ceqaa = vceqq_f32(a.neon_f32, a.neon_f32); - uint32x4_t ceqbb = vceqq_f32(b.neon_f32, b.neon_f32); - float32x4_t s = vreinterpretq_f32_u32(vandq_u32(ceqaa, ceqbb)); - float32x4_t t = vextq_f32(a.neon_f32, s, 1); - r.neon_f32 = vextq_f32(t, t, 3); -#elif defined(SIMDE__SHUFFLE_VECTOR) && defined(SIMDE_ASSUME_VECTORIZATION) - r.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a.f32, - simde_mm_cmpord_ps(a, b).f32, 4, 1, 2, 3); -#else - r.u32[0] = (isnan(a.f32[0]) || isnan(b.f32[0])) ? 0 : 0xffffffff; - SIMDE__VECTORIZE - for (size_t i = 1; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) { - r.f32[i] = a.f32[i]; - } + return simde__m128_from_private(r_); #endif - - return r; } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_cmpord_ps(a, b) simde_mm_cmpord_ps((a), (b)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_cmpunord_ps(simde__m128 a, simde__m128 b) { - simde__m128 r; - -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_cmpunord_ps(a.n, b.n); -#else - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) { - r.u32[i] = (isnan(a.f32[i]) || isnan(b.f32[i])) ? 0xffffffff - : 0; +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_cmpunord_ps(a, b); +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + return wasm_v128_or(wasm_f32x4_ne(a, a), wasm_f32x4_ne(b, b)); +#else + simde__m128_private r_, a_ = simde__m128_to_private(a), + b_ = simde__m128_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + uint32x4_t ceqaa = vceqq_f32(a_.neon_f32, a_.neon_f32); + uint32x4_t ceqbb = vceqq_f32(b_.neon_f32, b_.neon_f32); + r_.neon_u32 = vmvnq_u32(vandq_u32(ceqaa, ceqbb)); +#elif defined(simde_math_isnanf) + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) { + r_.u32[i] = (simde_math_isnanf(a_.f32[i]) || + simde_math_isnanf(b_.f32[i])) + ? ~UINT32_C(0) + : UINT32_C(0); } +#else + HEDLEY_UNREACHABLE(); #endif - return r; + return simde__m128_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_cmpunord_ps(a, b) simde_mm_cmpunord_ps((a), (b)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_cmpunord_ss(simde__m128 a, simde__m128 b) { - simde__m128 r; - -#if defined(SIMDE_SSE_NATIVE) && !defined(__PGI) - r.n = _mm_cmpunord_ss(a.n, b.n); -#elif defined(SIMDE__SHUFFLE_VECTOR) && defined(SIMDE_ASSUME_VECTORIZATION) - r.f32 = SIMDE__SHUFFLE_VECTOR( - 32, 16, a.f32, simde_mm_cmpunord_ps(a, b).f32, 4, 1, 2, 3); -#else - r.u32[0] = (isnan(a.f32[0]) || isnan(b.f32[0])) ? 
0xffffffff : 0; - SIMDE__VECTORIZE - for (size_t i = 1; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) { - r.f32[i] = a.f32[i]; +#if defined(SIMDE_X86_SSE_NATIVE) && !defined(__PGI) + return _mm_cmpunord_ss(a, b); +#elif defined(SIMDE_ASSUME_VECTORIZATION) + return simde_mm_move_ss(a, simde_mm_cmpunord_ps(a, b)); +#else + simde__m128_private r_, a_ = simde__m128_to_private(a), + b_ = simde__m128_to_private(b); + +#if defined(simde_math_isnanf) + r_.u32[0] = + (simde_math_isnanf(a_.f32[0]) || simde_math_isnanf(b_.f32[0])) + ? ~UINT32_C(0) + : UINT32_C(0); + SIMDE_VECTORIZE + for (size_t i = 1; i < (sizeof(r_.u32) / sizeof(r_.u32[0])); i++) { + r_.u32[i] = a_.u32[i]; } +#else + HEDLEY_UNREACHABLE(); #endif - return r; + return simde__m128_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_cmpunord_ss(a, b) simde_mm_cmpunord_ss((a), (b)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES int simde_mm_comieq_ss(simde__m128 a, simde__m128 b) { -#if defined(SIMDE_SSE_NATIVE) - return _mm_comieq_ss(a.n, b.n); -#elif defined(SIMDE_SSE_NEON) - uint32x4_t a_not_nan = vceqq_f32(a.neon_f32, a.neon_f32); - uint32x4_t b_not_nan = vceqq_f32(b.neon_f32, b.neon_f32); +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_comieq_ss(a, b); +#else + simde__m128_private a_ = simde__m128_to_private(a), + b_ = simde__m128_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32); + uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32); uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan)); - uint32x4_t a_eq_b = vceqq_f32(a.neon_f32, b.neon_f32); - return (vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_eq_b), 0) != 0) ? 1 : 0; + uint32x4_t a_eq_b = vceqq_f32(a_.neon_f32, b_.neon_f32); + return !!(vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_eq_b), 0) != 0); #else - return a.f32[0] == b.f32[0]; + return a_.f32[0] == b_.f32[0]; +#endif #endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_comieq_ss(a, b) simde_mm_comieq_ss((a), (b)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES int simde_mm_comige_ss(simde__m128 a, simde__m128 b) { -#if defined(SIMDE_SSE_NATIVE) - return _mm_comige_ss(a.n, b.n); -#elif defined(SIMDE_SSE_NEON) - uint32x4_t a_not_nan = vceqq_f32(a.neon_f32, a.neon_f32); - uint32x4_t b_not_nan = vceqq_f32(b.neon_f32, b.neon_f32); +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_comige_ss(a, b); +#else + simde__m128_private a_ = simde__m128_to_private(a), + b_ = simde__m128_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32); + uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32); uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); - uint32x4_t a_ge_b = vcgeq_f32(a.neon_f32, b.neon_f32); - return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) != 0) ? 
1 - : 0; + uint32x4_t a_ge_b = vcgeq_f32(a_.neon_f32, b_.neon_f32); + return !!(vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) != 0); #else - return a.f32[0] >= b.f32[0]; + return a_.f32[0] >= b_.f32[0]; +#endif #endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_comige_ss(a, b) simde_mm_comige_ss((a), (b)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES int simde_mm_comigt_ss(simde__m128 a, simde__m128 b) { -#if defined(SIMDE_SSE_NATIVE) - return _mm_comigt_ss(a.n, b.n); -#elif defined(SIMDE_SSE_NEON) - uint32x4_t a_not_nan = vceqq_f32(a.neon_f32, a.neon_f32); - uint32x4_t b_not_nan = vceqq_f32(b.neon_f32, b.neon_f32); +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_comigt_ss(a, b); +#else + simde__m128_private a_ = simde__m128_to_private(a), + b_ = simde__m128_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32); + uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32); uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); - uint32x4_t a_gt_b = vcgtq_f32(a.neon_f32, b.neon_f32); - return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) != 0) ? 1 - : 0; + uint32x4_t a_gt_b = vcgtq_f32(a_.neon_f32, b_.neon_f32); + return !!(vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) != 0); #else - return a.f32[0] > b.f32[0]; + return a_.f32[0] > b_.f32[0]; +#endif #endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_comigt_ss(a, b) simde_mm_comigt_ss((a), (b)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES int simde_mm_comile_ss(simde__m128 a, simde__m128 b) { -#if defined(SIMDE_SSE_NATIVE) - return _mm_comile_ss(a.n, b.n); -#elif defined(SIMDE_SSE_NEON) - uint32x4_t a_not_nan = vceqq_f32(a.neon_f32, a.neon_f32); - uint32x4_t b_not_nan = vceqq_f32(b.neon_f32, b.neon_f32); +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_comile_ss(a, b); +#else + simde__m128_private a_ = simde__m128_to_private(a), + b_ = simde__m128_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32); + uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32); uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan)); - uint32x4_t a_le_b = vcleq_f32(a.neon_f32, b.neon_f32); - return (vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_le_b), 0) != 0) ? 
1 : 0; + uint32x4_t a_le_b = vcleq_f32(a_.neon_f32, b_.neon_f32); + return !!(vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_le_b), 0) != 0); #else - return a.f32[0] <= b.f32[0]; + return a_.f32[0] <= b_.f32[0]; +#endif #endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_comile_ss(a, b) simde_mm_comile_ss((a), (b)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES int simde_mm_comilt_ss(simde__m128 a, simde__m128 b) { -#if defined(SIMDE_SSE_NATIVE) - return _mm_comilt_ss(a.n, b.n); -#elif defined(SIMDE_SSE_NATIVE) - uint32x4_t a_not_nan = vceqq_f32(a.neon_f32, a.neon_f32); - uint32x4_t b_not_nan = vceqq_f32(b.neon_f32, b.neon_f32); +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_comilt_ss(a, b); +#else + simde__m128_private a_ = simde__m128_to_private(a), + b_ = simde__m128_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32); + uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32); uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan)); - uint32x4_t a_lt_b = vcltq_f32(a.neon_f32, b.neon_f32); - return (vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_lt_b), 0) != 0) ? 1 : 0; + uint32x4_t a_lt_b = vcltq_f32(a_.neon_f32, b_.neon_f32); + return !!(vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_lt_b), 0) != 0); #else - return a.f32[0] < b.f32[0]; + return a_.f32[0] < b_.f32[0]; +#endif #endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_comilt_ss(a, b) simde_mm_comilt_ss((a), (b)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES int simde_mm_comineq_ss(simde__m128 a, simde__m128 b) { -#if defined(SIMDE_SSE_NATIVE) - return _mm_comineq_ss(a.n, b.n); -#elif defined(SIMDE_SSE_NEON) - uint32x4_t a_not_nan = vceqq_f32(a.neon_f32, a.neon_f32); - uint32x4_t b_not_nan = vceqq_f32(b.neon_f32, b.neon_f32); +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_comineq_ss(a, b); +#else + simde__m128_private a_ = simde__m128_to_private(a), + b_ = simde__m128_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32); + uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32); uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); - uint32x4_t a_neq_b = vmvnq_u32(vceqq_f32(a.neon_f32, b.neon_f32)); - return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_neq_b), 0) != 0) - ? 
1 - : 0; + uint32x4_t a_neq_b = vmvnq_u32(vceqq_f32(a_.neon_f32, b_.neon_f32)); + return !!(vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_neq_b), 0) != 0); #else - return a.f32[0] != b.f32[0]; + return a_.f32[0] != b_.f32[0]; +#endif #endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_comineq_ss(a, b) simde_mm_comineq_ss((a), (b)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_cvt_pi2ps(simde__m128 a, simde__m64 b) { - simde__m128 r; +#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) + return _mm_cvt_pi2ps(a, b); +#else + simde__m128_private r_, a_ = simde__m128_to_private(a); + simde__m64_private b_ = simde__m64_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_f32 = vcombine_f32(vcvt_f32_s32(b_.neon_i32), + vget_high_f32(a_.neon_f32)); +#elif defined(SIMDE_CONVERT_VECTOR_) + SIMDE_CONVERT_VECTOR_(r_.m64_private[0].f32, b_.i32); + r_.m64_private[1] = a_.m64_private[1]; -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_cvt_pi2ps(a.n, b.n); #else - r.f32[0] = (simde_float32)b.i32[0]; - r.f32[1] = (simde_float32)b.i32[1]; - r.i32[2] = a.i32[2]; - r.i32[3] = a.i32[3]; + r_.f32[0] = (simde_float32)b_.i32[0]; + r_.f32[1] = (simde_float32)b_.i32[1]; + r_.i32[2] = a_.i32[2]; + r_.i32[3] = a_.i32[3]; #endif - return r; + return simde__m128_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_cvt_pi2ps(a, b) simde_mm_cvt_pi2ps((a), b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_cvt_ps2pi(simde__m128 a) { - simde__m64 r; +#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) + return _mm_cvt_ps2pi(a); +#else + simde__m64_private r_; + simde__m128_private a_ = simde__m128_to_private(a); -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_cvt_ps2pi(a.n); +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i32 = vcvt_s32_f32(vget_low_f32(a_.neon_f32)); +#elif defined(SIMDE_CONVERT_VECTOR_) && !defined(__clang__) + SIMDE_CONVERT_VECTOR_(r_.i32, a_.m64_private[0].f32); #else - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) { - r.i32[i] = (int32_t)a.f32[i]; + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) { + r_.i32[i] = HEDLEY_STATIC_CAST(int32_t, a_.f32[i]); } #endif - return r; + return simde__m64_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_cvt_ps2pi(a) simde_mm_cvt_ps2pi((a)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_cvt_si2ss(simde__m128 a, int32_t b) { - simde__m128 r; +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_cvt_si2ss(a, b); +#else + simde__m128_private r_, a_ = simde__m128_to_private(a); -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_cvt_si2ss(a.n, b); +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_f32 = vsetq_lane_f32((float)b, a_.neon_f32, 0); #else - r.f32[0] = (simde_float32)b; - r.i32[1] = a.i32[1]; - r.i32[2] = a.i32[2]; - r.i32[3] = a.i32[3]; + r_.f32[0] = HEDLEY_STATIC_CAST(simde_float32, b); + r_.i32[1] = a_.i32[1]; + r_.i32[2] = a_.i32[2]; + r_.i32[3] = a_.i32[3]; #endif - return r; + return simde__m128_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_cvt_si2ss(a, b) simde_mm_cvt_si2ss((a), b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES int32_t simde_mm_cvt_ss2si(simde__m128 a) { -#if defined(SIMDE_SSE_NATIVE) - return _mm_cvt_ss2si(a.n); +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_cvt_ss2si(a); 
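/* Aside (hedged, illustrative sketch only -- not part of the upstream patch): the
 * portable fallbacks around here distinguish _mm_cvt_ss2si, which converts using
 * the current rounding mode (via simde_math_nearbyintf), from the _mm_cvtt_* variants,
 * which always truncate toward zero. The sketch_* names below are invented for
 * illustration and are not SIMDe API. */
#include <math.h>
#include <stdint.h>

static int32_t sketch_cvt_ss2si(float x)
{
	return (int32_t)nearbyintf(x); /* honours fegetround(); FE_TONEAREST by default */
}

static int32_t sketch_cvtt_ss2si(float x)
{
	return (int32_t)x; /* a plain C cast truncates toward zero, matching the "tt" variants */
}

/* e.g. sketch_cvt_ss2si(2.7f) == 3, while sketch_cvtt_ss2si(2.7f) == 2. */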
#else - return (int32_t)a.f32[0]; + simde__m128_private a_ = simde__m128_to_private(a); + +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && !defined(SIMDE_BUG_GCC_95399) + return vgetq_lane_s32(vcvtnq_s32_f32(a_.neon_f32), 0); +#elif defined(simde_math_nearbyintf) + return SIMDE_CONVERT_FTOI(int32_t, simde_math_nearbyintf(a_.f32[0])); +#else + HEDLEY_UNREACHABLE(); +#endif #endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_cvt_ss2si(a) simde_mm_cvt_ss2si((a)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_cvtpi16_ps(simde__m64 a) { - simde__m128 r; +#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) + return _mm_cvtpi16_ps(a); +#else + simde__m128_private r_; + simde__m64_private a_ = simde__m64_to_private(a); -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_cvtpi16_ps(a.n); +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && 0 /* TODO */ + r_.neon_f32 = vmovl_s16( + vget_low_s16(vuzp1q_s16(a_.neon_i16, vmovq_n_s16(0)))); +#elif defined(SIMDE_CONVERT_VECTOR_) + SIMDE_CONVERT_VECTOR_(r_.f32, a_.i16); #else - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) { - r.f32[i] = (simde_float32)a.i16[i]; + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) { + simde_float32 v = a_.i16[i]; + r_.f32[i] = v; } #endif - return r; + return simde__m128_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_cvtpi16_ps(a) simde_mm_cvtpi16_ps(a) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_cvtpi32_ps(simde__m128 a, simde__m64 b) { - simde__m128 r; +#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) + return _mm_cvtpi32_ps(a, b); +#else + simde__m128_private r_, a_ = simde__m128_to_private(a); + simde__m64_private b_ = simde__m64_to_private(b); -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_cvtpi32_ps(a.n, b.n); +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_f32 = vcombine_f32(vcvt_f32_s32(b_.neon_i32), + vget_high_f32(a_.neon_f32)); +#elif defined(SIMDE_CONVERT_VECTOR_) + SIMDE_CONVERT_VECTOR_(r_.m64_private[0].f32, b_.i32); + r_.m64_private[1] = a_.m64_private[1]; #else - r.f32[0] = (simde_float32)b.i32[0]; - r.f32[1] = (simde_float32)b.i32[1]; - r.i32[2] = a.i32[2]; - r.i32[3] = a.i32[3]; + r_.f32[0] = (simde_float32)b_.i32[0]; + r_.f32[1] = (simde_float32)b_.i32[1]; + r_.i32[2] = a_.i32[2]; + r_.i32[3] = a_.i32[3]; #endif - return r; + return simde__m128_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_cvtpi32_ps(a, b) simde_mm_cvtpi32_ps((a), b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_cvtpi32x2_ps(simde__m64 a, simde__m64 b) { - simde__m128 r; +#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) + return _mm_cvtpi32x2_ps(a, b); +#else + simde__m128_private r_; + simde__m64_private a_ = simde__m64_to_private(a), + b_ = simde__m64_to_private(b); -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_cvtpi32x2_ps(a.n, b.n); +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_f32 = vcvtq_f32_s32(vcombine_s32(a_.neon_i32, b_.neon_i32)); +#elif defined(SIMDE_CONVERT_VECTOR_) + SIMDE_CONVERT_VECTOR_(r_.m64_private[0].f32, a_.i32); + SIMDE_CONVERT_VECTOR_(r_.m64_private[1].f32, b_.i32); #else - r.f32[0] = (simde_float32)a.i32[0]; - r.f32[1] = (simde_float32)a.i32[1]; - r.f32[2] = (simde_float32)b.i32[0]; - r.f32[3] = (simde_float32)b.i32[1]; + r_.f32[0] = (simde_float32)a_.i32[0]; + r_.f32[1] = 
(simde_float32)a_.i32[1]; + r_.f32[2] = (simde_float32)b_.i32[0]; + r_.f32[3] = (simde_float32)b_.i32[1]; #endif - return r; + return simde__m128_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_cvtpi32x2_ps(a, b) simde_mm_cvtpi32x2_ps(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_cvtpi8_ps(simde__m64 a) { - simde__m128 r; +#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) + return _mm_cvtpi8_ps(a); +#else + simde__m128_private r_; + simde__m64_private a_ = simde__m64_to_private(a); -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_cvtpi8_ps(a.n); +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_f32 = + vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(a_.neon_i8)))); #else - r.f32[0] = (simde_float32)a.i8[0]; - r.f32[1] = (simde_float32)a.i8[1]; - r.f32[2] = (simde_float32)a.i8[2]; - r.f32[3] = (simde_float32)a.i8[3]; + r_.f32[0] = HEDLEY_STATIC_CAST(simde_float32, a_.i8[0]); + r_.f32[1] = HEDLEY_STATIC_CAST(simde_float32, a_.i8[1]); + r_.f32[2] = HEDLEY_STATIC_CAST(simde_float32, a_.i8[2]); + r_.f32[3] = HEDLEY_STATIC_CAST(simde_float32, a_.i8[3]); #endif - return r; + return simde__m128_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_cvtpi8_ps(a) simde_mm_cvtpi8_ps(a) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_cvtps_pi16(simde__m128 a) { - simde__m64 r; +#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) + return _mm_cvtps_pi16(a); +#else + simde__m64_private r_; + simde__m128_private a_ = simde__m128_to_private(a); -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_cvtps_pi16(a.n); +#if defined(SIMDE_CONVERT_VECTOR_) + SIMDE_CONVERT_VECTOR_(r_.i16, a_.f32); +#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i16 = vmovn_s32(vcvtq_s32_f32(a_.neon_f32)); #else - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) { - r.i16[i] = (int16_t)a.f32[i]; + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) { + r_.i16[i] = SIMDE_CONVERT_FTOI(int16_t, a_.f32[i]); } #endif - return r; + return simde__m64_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_cvtps_pi16(a) simde_mm_cvtps_pi16((a)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_cvtps_pi32(simde__m128 a) { - simde__m64 r; +#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) + return _mm_cvtps_pi32(a); +#else + simde__m64_private r_; + simde__m128_private a_ = simde__m128_to_private(a); -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_cvtps_pi32(a.n); +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i32 = vcvt_s32_f32(vget_low_f32(a_.neon_f32)); +#elif defined(SIMDE_CONVERT_VECTOR_) + SIMDE_CONVERT_VECTOR_(r_.i32, a_.m64_private[0].f32); #else - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) { - r.i32[i] = (int32_t)a.f32[i]; + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) { + r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, a_.f32[i]); } #endif - return r; + return simde__m64_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_cvtps_pi32(a) simde_mm_cvtps_pi32((a)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_cvtps_pi8(simde__m128 a) { - simde__m64 r; +#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) + return _mm_cvtps_pi8(a); +#else + 
simde__m64_private r_; + simde__m128_private a_ = simde__m128_to_private(a); -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_cvtps_pi8(a.n); +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + int16x4_t b = vmovn_s32(vcvtq_s32_f32(a_.neon_f32)); + int16x8_t c = vcombine_s16(b, vmov_n_s16(0)); + r_.neon_i8 = vmovn_s16(c); #else - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(a.f32) / sizeof(a.f32[0])); i++) { - r.i8[i] = (int8_t)a.f32[i]; + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(a_.f32) / sizeof(a_.f32[0])); i++) { + r_.i8[i] = SIMDE_CONVERT_FTOI(int8_t, a_.f32[i]); } + /* Note: the upper half is undefined */ #endif - return r; + return simde__m64_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_cvtps_pi8(a) simde_mm_cvtps_pi8((a)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_cvtpu16_ps(simde__m64 a) { - simde__m128 r; +#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) + return _mm_cvtpu16_ps(a); +#else + simde__m128_private r_; + simde__m64_private a_ = simde__m64_to_private(a); -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_cvtpu16_ps(a.n); +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_f32 = vcvtq_f32_u32(vmovl_u16(a_.neon_u16)); +#elif defined(SIMDE_CONVERT_VECTOR_) + SIMDE_CONVERT_VECTOR_(r_.f32, a_.u16); #else - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) { - r.f32[i] = (simde_float32)a.u16[i]; + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) { + r_.f32[i] = (simde_float32)a_.u16[i]; } #endif - return r; + return simde__m128_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_cvtpu16_ps(a) simde_mm_cvtpu16_ps(a) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_cvtpu8_ps(simde__m64 a) { - simde__m128 r; +#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) + return _mm_cvtpu8_ps(a); +#else + simde__m128_private r_; + simde__m64_private a_ = simde__m64_to_private(a); -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_cvtpu8_ps(a.n); +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_f32 = + vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(a_.neon_u8)))); #else - SIMDE__VECTORIZE - for (size_t i = 0; i < 4; i++) { - r.f32[i] = (simde_float32)a.u8[i]; + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) { + r_.f32[i] = HEDLEY_STATIC_CAST(simde_float32, a_.u8[i]); } #endif - return r; + return simde__m128_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_cvtpu8_ps(a) simde_mm_cvtpu8_ps(a) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_cvtsi32_ss(simde__m128 a, int32_t b) { - simde__m128 r; +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_cvtsi32_ss(a, b); +#else + simde__m128_private r_; + simde__m128_private a_ = simde__m128_to_private(a); -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_cvtsi32_ss(a.n, b); +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_f32 = vsetq_lane_f32((simde_float32)b, a_.neon_f32, 0); #else - r.f32[0] = (simde_float32)b; - SIMDE__VECTORIZE - for (size_t i = 1; i < 4; i++) { - r.i32[i] = a.i32[i]; + r_.f32[0] = HEDLEY_STATIC_CAST(simde_float32, b); + SIMDE_VECTORIZE + for (size_t i = 1; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) { + r_.i32[i] = a_.i32[i]; } #endif - return r; + return simde__m128_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_cvtsi32_ss(a, b) 
simde_mm_cvtsi32_ss((a), b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_cvtsi64_ss(simde__m128 a, int64_t b) { - simde__m128 r; - -#if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_AMD64) +#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_ARCH_AMD64) #if !defined(__PGI) - r.n = _mm_cvtsi64_ss(a.n, b); + return _mm_cvtsi64_ss(a, b); #else - r.n = _mm_cvtsi64x_ss(a.n, b); + return _mm_cvtsi64x_ss(a, b); #endif #else - r.f32[0] = (simde_float32)b; - SIMDE__VECTORIZE - for (size_t i = 1; i < 4; i++) { - r.i32[i] = a.i32[i]; - } + simde__m128_private r_; + simde__m128_private a_ = simde__m128_to_private(a); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_f32 = vsetq_lane_f32((simde_float32)b, a_.neon_f32, 0); +#else + r_ = a_; + r_.f32[0] = HEDLEY_STATIC_CAST(simde_float32, b); #endif - return r; + return simde__m128_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_cvtsi64_ss(a, b) simde_mm_cvtsi64_ss((a), b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde_float32 simde_mm_cvtss_f32(simde__m128 a) { -#if defined(SIMDE_SSE_NATIVE) - return _mm_cvtss_f32(a.n); -#elif defined(SIMDE_SSE_NEON) - return vgetq_lane_f32(a.neon_f32, 0); +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_cvtss_f32(a); +#else + simde__m128_private a_ = simde__m128_to_private(a); +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vgetq_lane_f32(a_.neon_f32, 0); #else - return a.f32[0]; + return a_.f32[0]; +#endif #endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_cvtss_f32(a) simde_mm_cvtss_f32((a)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES int32_t simde_mm_cvtss_si32(simde__m128 a) { -#if defined(SIMDE_SSE_NATIVE) - return _mm_cvtss_si32(a.n); -#else - return (int32_t)a.f32[0]; -#endif + return simde_mm_cvt_ss2si(a); } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_cvtss_si32(a) simde_mm_cvtss_si32((a)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES int64_t simde_mm_cvtss_si64(simde__m128 a) { -#if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_AMD64) +#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_ARCH_AMD64) #if !defined(__PGI) - return _mm_cvtss_si64(a.n); + return _mm_cvtss_si64(a); #else - return _mm_cvtss_si64x(a.n); + return _mm_cvtss_si64x(a); #endif #else - return (int64_t)a.f32[0]; + simde__m128_private a_ = simde__m128_to_private(a); +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return SIMDE_CONVERT_FTOI(int64_t, vgetq_lane_f32(a_.neon_f32, 0)); +#else + return SIMDE_CONVERT_FTOI(int64_t, a_.f32[0]); +#endif #endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_cvtss_si64(a) simde_mm_cvtss_si64((a)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_cvtt_ps2pi(simde__m128 a) { - simde__m64 r; +#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) + return _mm_cvtt_ps2pi(a); +#else + simde__m64_private r_; + simde__m128_private a_ = simde__m128_to_private(a); -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_cvtt_ps2pi(a.n); +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i32 = vcvt_s32_f32(vget_low_f32(a_.neon_f32)); +#elif defined(SIMDE_CONVERT_VECTOR_) + SIMDE_CONVERT_VECTOR_(r_.i32, a_.m64_private[0].f32); #else - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) { - r.i32[i] = (int32_t)truncf(a.f32[i]); + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) { + r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, 
a_.f32[i]); } #endif - return r; + return simde__m64_from_private(r_); +#endif } +#define simde_mm_cvttps_pi32(a) simde_mm_cvtt_ps2pi(a) +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_cvtt_ps2pi(a) simde_mm_cvtt_ps2pi((a)) +#define _mm_cvttps_pi32(a) simde_mm_cvttps_pi32((a)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES int32_t simde_mm_cvtt_ss2si(simde__m128 a) { -#if defined(SIMDE_SSE_NATIVE) - return _mm_cvtt_ss2si(a.n); +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_cvtt_ss2si(a); +#else + simde__m128_private a_ = simde__m128_to_private(a); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return SIMDE_CONVERT_FTOI(int32_t, vgetq_lane_f32(a_.neon_f32, 0)); #else - return (int32_t)truncf(a.f32[0]); + return SIMDE_CONVERT_FTOI(int32_t, a_.f32[0]); +#endif #endif } +#define simde_mm_cvttss_si32(a) simde_mm_cvtt_ss2si((a)) +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_cvtt_ss2si(a) simde_mm_cvtt_ss2si((a)) +#define _mm_cvttss_si32(a) simde_mm_cvtt_ss2si((a)) +#endif -SIMDE__FUNCTION_ATTRIBUTES -simde__m64 simde_mm_cvttps_pi32(simde__m128 a) +SIMDE_FUNCTION_ATTRIBUTES +int64_t simde_mm_cvttss_si64(simde__m128 a) { - simde__m64 r; - -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_cvttps_pi32(a.n); +#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_ARCH_AMD64) && \ + !defined(_MSC_VER) +#if defined(__PGI) + return _mm_cvttss_si64x(a); #else - r = simde_mm_cvtt_ps2pi(a); + return _mm_cvttss_si64(a); #endif +#else + simde__m128_private a_ = simde__m128_to_private(a); - return r; -} - -SIMDE__FUNCTION_ATTRIBUTES -int32_t simde_mm_cvttss_si32(simde__m128 a) -{ -#if defined(SIMDE_SSE_NATIVE) - return _mm_cvttss_si32(a.n); +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return SIMDE_CONVERT_FTOI(int64_t, vgetq_lane_f32(a_.neon_f32, 0)); #else - return (int32_t)truncf(a.f32[0]); + return SIMDE_CONVERT_FTOI(int64_t, a_.f32[0]); +#endif #endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_cvttss_si64(a) simde_mm_cvttss_si64((a)) +#endif -SIMDE__FUNCTION_ATTRIBUTES -int64_t simde_mm_cvttss_si64(simde__m128 a) +SIMDE_FUNCTION_ATTRIBUTES +simde__m128 simde_mm_cmpord_ss(simde__m128 a, simde__m128 b) { -#if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_AMD64) -#if defined(__PGI) - return _mm_cvttss_si64x(a.n); +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_cmpord_ss(a, b); +#elif defined(SIMDE_ASSUME_VECTORIZATION) + return simde_mm_move_ss(a, simde_mm_cmpord_ps(a, b)); +#else + simde__m128_private r_, a_ = simde__m128_to_private(a); + +#if defined(simde_math_isnanf) + r_.u32[0] = (simde_math_isnanf(simde_mm_cvtss_f32(a)) || + simde_math_isnanf(simde_mm_cvtss_f32(b))) + ? 
UINT32_C(0) + : ~UINT32_C(0); + SIMDE_VECTORIZE + for (size_t i = 1; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) { + r_.u32[i] = a_.u32[i]; + } #else - return _mm_cvttss_si64(a.n); + HEDLEY_UNREACHABLE(); #endif -#else - return (int64_t)truncf(a.f32[0]); + + return simde__m128_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_cmpord_ss(a, b) simde_mm_cmpord_ss((a), (b)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_div_ps(simde__m128 a, simde__m128 b) { - simde__m128 r; - -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_div_ps(a.n, b.n); -#elif defined(SIMDE_SSE_NEON) - float32x4_t recip0 = vrecpeq_f32(b.neon_f32); - float32x4_t recip1 = vmulq_f32(recip0, vrecpsq_f32(recip0, b.neon_f32)); - r.neon_f32 = vmulq_f32(a.neon_f32, recip1); -#else - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) { - r.f32[i] = a.f32[i] / b.f32[i]; +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_div_ps(a, b); +#else + simde__m128_private r_, a_ = simde__m128_to_private(a), + b_ = simde__m128_to_private(b); + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + r_.neon_f32 = vdivq_f32(a_.neon_f32, b_.neon_f32); +#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + float32x4_t recip0 = vrecpeq_f32(b_.neon_f32); + float32x4_t recip1 = + vmulq_f32(recip0, vrecpsq_f32(recip0, b_.neon_f32)); + r_.neon_f32 = vmulq_f32(a_.neon_f32, recip1); +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f32x4_div(a_.wasm_v128, b_.wasm_v128); +#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.f32 = a_.f32 / b_.f32; +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) { + r_.f32[i] = a_.f32[i] / b_.f32[i]; } #endif - return r; + return simde__m128_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_div_ps(a, b) simde_mm_div_ps((a), (b)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_div_ss(simde__m128 a, simde__m128 b) { - simde__m128 r; - -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_div_ss(a.n, b.n); -#elif defined(SIMDE_SSE_NEON) - float32_t value = vgetq_lane_f32(simde_mm_div_ps(a, b).neon_f32, 0); - r.neon_f32 = vsetq_lane_f32(value, a.neon_f32, 0); +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_div_ss(a, b); +#elif defined(SIMDE_ASSUME_VECTORIZATION) + return simde_mm_move_ss(a, simde_mm_div_ps(a, b)); #else - r.f32[0] = a.f32[0] / b.f32[0]; - SIMDE__VECTORIZE - for (size_t i = 1; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) { - r.f32[i] = a.f32[i]; + simde__m128_private r_, a_ = simde__m128_to_private(a), + b_ = simde__m128_to_private(b); + + r_.f32[0] = a_.f32[0] / b_.f32[0]; + SIMDE_VECTORIZE + for (size_t i = 1; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) { + r_.f32[i] = a_.f32[i]; } -#endif - return r; + return simde__m128_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_div_ss(a, b) simde_mm_div_ss((a), (b)) +#endif -SIMDE__FUNCTION_ATTRIBUTES -int32_t simde_mm_extract_pi16(simde__m64 a, const int imm8) +SIMDE_FUNCTION_ATTRIBUTES +int16_t simde_mm_extract_pi16(simde__m64 a, const int imm8) + SIMDE_REQUIRE_RANGE(imm8, 0, 3) { - return a.u16[imm8]; + simde__m64_private a_ = simde__m64_to_private(a); + return a_.i16[imm8]; } -#if defined(SIMDE_SSE_NATIVE) -#define simde_mm_extract_pi16(a, imm8) _mm_extract_pi16(a.n, imm8) +#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && \ + !defined(HEDLEY_PGI_VERSION) +#if HEDLEY_HAS_WARNING("-Wvector-conversion") 
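/* Aside (hedged, illustrative sketch only -- not part of the upstream patch): when
 * neither the x86 intrinsic nor NEON is available, simde_mm_extract_pi16 simply
 * reads lane imm8 of the 64-bit value viewed as four int16_t lanes. The
 * sketch_extract_pi16 name is invented for illustration and assumes a
 * little-endian lane layout, as on x86 and AArch64 little-endian. */
#include <stdint.h>
#include <string.h>

static int16_t sketch_extract_pi16(uint64_t v, int lane /* 0..3 */)
{
	int16_t lanes[4];
	memcpy(lanes, &v, sizeof(lanes)); /* reinterpret the 64-bit vector as 4 x int16_t */
	return lanes[lane];
}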
+/* https://bugs.llvm.org/show_bug.cgi?id=44589 */ +#define simde_mm_extract_pi16(a, imm8) \ + (HEDLEY_DIAGNOSTIC_PUSH _Pragma( \ + "clang diagnostic ignored \"-Wvector-conversion\"") \ + HEDLEY_STATIC_CAST(int16_t, _mm_extract_pi16((a), (imm8))) \ + HEDLEY_DIAGNOSTIC_POP) +#else +#define simde_mm_extract_pi16(a, imm8) \ + HEDLEY_STATIC_CAST(int16_t, _mm_extract_pi16(a, imm8)) +#endif +#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) +#define simde_mm_extract_pi16(a, imm8) \ + vget_lane_s16(simde__m64_to_private(a).neon_i16, imm8) +#endif +#define simde_m_pextrw(a, imm8) simde_mm_extract_pi16(a, imm8) +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_extract_pi16(a, imm8) simde_mm_extract_pi16((a), (imm8)) #endif -#define simde_m_pextrw(a, imm8) simde_mm_extract_pi16(a.n, imm8) enum { -#if defined(SIMDE_SSE_NATIVE) - simde_MM_ROUND_NEAREST = _MM_ROUND_NEAREST, - simde_MM_ROUND_DOWN = _MM_ROUND_DOWN, - simde_MM_ROUND_UP = _MM_ROUND_UP, - simde_MM_ROUND_TOWARD_ZERO = _MM_ROUND_TOWARD_ZERO +#if defined(SIMDE_X86_SSE_NATIVE) + SIMDE_MM_ROUND_NEAREST = _MM_ROUND_NEAREST, + SIMDE_MM_ROUND_DOWN = _MM_ROUND_DOWN, + SIMDE_MM_ROUND_UP = _MM_ROUND_UP, + SIMDE_MM_ROUND_TOWARD_ZERO = _MM_ROUND_TOWARD_ZERO #else - simde_MM_ROUND_NEAREST + SIMDE_MM_ROUND_NEAREST #if defined(FE_TONEAREST) = FE_TONEAREST #endif , - simde_MM_ROUND_DOWN + SIMDE_MM_ROUND_DOWN #if defined(FE_DOWNWARD) = FE_DOWNWARD #endif , - simde_MM_ROUND_UP + SIMDE_MM_ROUND_UP #if defined(FE_UPWARD) = FE_UPWARD #endif , - simde_MM_ROUND_TOWARD_ZERO + SIMDE_MM_ROUND_TOWARD_ZERO #if defined(FE_TOWARDZERO) = FE_TOWARDZERO #endif #endif }; -SIMDE__FUNCTION_ATTRIBUTES -unsigned int simde_MM_GET_ROUNDING_MODE(void) +SIMDE_FUNCTION_ATTRIBUTES +unsigned int SIMDE_MM_GET_ROUNDING_MODE(void) { -#if defined(SIMDE_SSE_NATIVE) +#if defined(SIMDE_X86_SSE_NATIVE) return _MM_GET_ROUNDING_MODE(); +#elif defined(SIMDE_HAVE_FENV_H) + return HEDLEY_STATIC_CAST(unsigned int, fegetround()); #else - return fegetround(); + HEDLEY_UNREACHABLE(); #endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _MM_GET_ROUNDING_MODE() SIMDE_MM_GET_ROUNDING_MODE() +#endif -SIMDE__FUNCTION_ATTRIBUTES -void simde_MM_SET_ROUNDING_MODE(unsigned int a) +SIMDE_FUNCTION_ATTRIBUTES +void SIMDE_MM_SET_ROUNDING_MODE(unsigned int a) { -#if defined(SIMDE_SSE_NATIVE) +#if defined(SIMDE_X86_SSE_NATIVE) _MM_SET_ROUNDING_MODE(a); -#else - fesetround((int)a); +#elif defined(SIMDE_HAVE_FENV_H) + fesetround(HEDLEY_STATIC_CAST(int, a)); #endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _MM_SET_ROUNDING_MODE(a) SIMDE_MM_SET_ROUNDING_MODE(a) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_insert_pi16(simde__m64 a, int16_t i, const int imm8) + SIMDE_REQUIRE_RANGE(imm8, 0, 3) { - simde__m64 r; - r.i64[0] = a.i64[0]; - r.i16[imm8] = i; - return r; + simde__m64_private r_, a_ = simde__m64_to_private(a); + + r_.i64[0] = a_.i64[0]; + r_.i16[imm8] = i; + + return simde__m64_from_private(r_); } -#if defined(SIMDE_SSE_NATIVE) && !defined(__PGI) +#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && \ + !defined(__PGI) +#if HEDLEY_HAS_WARNING("-Wvector-conversion") +/* https://bugs.llvm.org/show_bug.cgi?id=44589 */ +#define simde_mm_insert_pi16(a, i, imm8) \ + (HEDLEY_DIAGNOSTIC_PUSH _Pragma( \ + "clang diagnostic ignored \"-Wvector-conversion\"")( \ + _mm_insert_pi16((a), (i), (imm8))) HEDLEY_DIAGNOSTIC_POP) +#else +#define simde_mm_insert_pi16(a, i, imm8) _mm_insert_pi16(a, i, imm8) +#endif +#elif 
defined(SIMDE_ARM_NEON_A32V7_NATIVE) #define simde_mm_insert_pi16(a, i, imm8) \ - SIMDE__M64_C(_mm_insert_pi16((a).n, i, imm8)); + simde__m64_from_neon_i16( \ + vset_lane_s16((i), simde__m64_to_neon_i16(a), (imm8))) +#endif +#define simde_m_pinsrw(a, i, imm8) (simde_mm_insert_pi16(a, i, imm8)) +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_insert_pi16(a, i, imm8) simde_mm_insert_pi16(a, i, imm8) +#define _m_pinsrw(a, i, imm8) simde_mm_insert_pi16(a, i, imm8) #endif -#define simde_m_pinsrw(a, i, imm8) \ - SIMDE__M64_C(simde_mm_insert_pi16((a).n, i, imm8)); -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_load_ps(simde_float32 const mem_addr[HEDLEY_ARRAY_PARAM(4)]) { - simde__m128 r; - simde_assert_aligned(16, mem_addr); -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_load_ps(mem_addr); -#elif defined(SIMDE_SSE_NEON) - r.neon_f32 = vld1q_f32(mem_addr); +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_load_ps(mem_addr); +#else + simde__m128_private r_; + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_f32 = vld1q_f32(mem_addr); +#elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) + r_.altivec_f32 = vec_vsx_ld(0, mem_addr); +#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) + r_.altivec_f32 = vec_ld(0, mem_addr); #else - memcpy(&r, mem_addr, sizeof(r.f32)); + r_ = *SIMDE_ALIGN_CAST(simde__m128_private const *, mem_addr); #endif - return r; + return simde__m128_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_load_ps(mem_addr) simde_mm_load_ps(mem_addr) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_load_ps1(simde_float32 const *mem_addr) { - simde__m128 r; +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_load_ps1(mem_addr); +#else + simde__m128_private r_; -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_load_ps1(mem_addr); +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_f32 = vld1q_dup_f32(mem_addr); #else - const simde_float32 v = *mem_addr; - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) { - r.f32[i] = v; - } + r_ = simde__m128_to_private(simde_mm_set1_ps(*mem_addr)); #endif - return r; + return simde__m128_from_private(r_); +#endif } +#define simde_mm_load1_ps(mem_addr) simde_mm_load_ps1(mem_addr) +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_load_ps1(mem_addr) simde_mm_load_ps1(mem_addr) +#define _mm_load1_ps(mem_addr) simde_mm_load_ps1(mem_addr) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_load_ss(simde_float32 const *mem_addr) { - simde__m128 r; - -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_load_ss(mem_addr); -#elif defined(SIMDE_SSE_NEON) - r.neon_f32 = vsetq_lane_f32(*mem_addr, vdupq_n_f32(0), 0); +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_load_ss(mem_addr); #else - r.f32[0] = *mem_addr; - r.i32[1] = 0; - r.i32[2] = 0; - r.i32[3] = 0; -#endif + simde__m128_private r_; - return r; -} - -SIMDE__FUNCTION_ATTRIBUTES -simde__m128 simde_mm_load1_ps(simde_float32 const *mem_addr) -{ - simde__m128 r; - -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_load1_ps(mem_addr); -#elif defined(SIMDE_SSE_NEON) - r.neon_f32 = vld1q_dup_f32(mem_addr); +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_f32 = vsetq_lane_f32(*mem_addr, vdupq_n_f32(0), 0); #else - r = simde_mm_load_ps1(mem_addr); + r_.f32[0] = *mem_addr; + r_.i32[1] = 0; + r_.i32[2] = 0; + r_.i32[3] = 0; #endif - return r; + return simde__m128_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define 
_mm_load_ss(mem_addr) simde_mm_load_ss(mem_addr) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_loadh_pi(simde__m128 a, simde__m64 const *mem_addr) { - simde__m128 r; +#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) + return _mm_loadh_pi(a, + HEDLEY_REINTERPRET_CAST(__m64 const *, mem_addr)); +#else + simde__m128_private r_, a_ = simde__m128_to_private(a); -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_loadh_pi(a.n, (__m64 *)mem_addr); +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_f32 = vcombine_f32( + vget_low_f32(a_.neon_f32), + vld1_f32(HEDLEY_REINTERPRET_CAST(const float32_t *, mem_addr))); #else - r.f32[0] = a.f32[0]; - r.f32[1] = a.f32[1]; - r.f32[2] = mem_addr->f32[0]; - r.f32[3] = mem_addr->f32[1]; + simde__m64_private b_ = + *HEDLEY_REINTERPRET_CAST(simde__m64_private const *, mem_addr); + r_.f32[0] = a_.f32[0]; + r_.f32[1] = a_.f32[1]; + r_.f32[2] = b_.f32[0]; + r_.f32[3] = b_.f32[1]; #endif - return r; + return simde__m128_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_loadh_pi(a, mem_addr) \ + simde_mm_loadh_pi((a), (simde__m64 const *)(mem_addr)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +/* The SSE documentation says that there are no alignment requirements + for mem_addr. Unfortunately they used the __m64 type for the argument + which is supposed to be 8-byte aligned, so some compilers (like clang + with -Wcast-align) will generate a warning if you try to cast, say, + a simde_float32* to a simde__m64* for this function. + + I think the choice of argument type is unfortunate, but I do think we + need to stick to it here. If there is demand I can always add something + like simde_x_mm_loadl_f32(simde__m128, simde_float32 mem_addr[2]) */ +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_loadl_pi(simde__m128 a, simde__m64 const *mem_addr) { - simde__m128 r; +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_loadl_pi(a, + HEDLEY_REINTERPRET_CAST(__m64 const *, mem_addr)); +#else + simde__m128_private r_, a_ = simde__m128_to_private(a); -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_loadl_pi(a.n, (__m64 *)mem_addr); +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_f32 = vcombine_f32( + vld1_f32(HEDLEY_REINTERPRET_CAST(const float32_t *, mem_addr)), + vget_high_f32(a_.neon_f32)); #else - r.f32[0] = mem_addr->f32[0]; - r.f32[1] = mem_addr->f32[1]; - r.f32[2] = a.f32[2]; - r.f32[3] = a.f32[3]; + simde__m64_private b_; + simde_memcpy(&b_, mem_addr, sizeof(b_)); + r_.i32[0] = b_.i32[0]; + r_.i32[1] = b_.i32[1]; + r_.i32[2] = a_.i32[2]; + r_.i32[3] = a_.i32[3]; #endif - return r; + return simde__m128_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_loadl_pi(a, mem_addr) \ + simde_mm_loadl_pi((a), (simde__m64 const *)(mem_addr)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_loadr_ps(simde_float32 const mem_addr[HEDLEY_ARRAY_PARAM(4)]) { - simde__m128 r; - simde_assert_aligned(16, mem_addr); -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_loadr_ps(mem_addr); +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_loadr_ps(mem_addr); +#else + simde__m128_private r_, + v_ = simde__m128_to_private(simde_mm_load_ps(mem_addr)); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_f32 = vrev64q_f32(v_.neon_f32); + r_.neon_f32 = vextq_f32(r_.neon_f32, r_.neon_f32, 2); +#elif defined(SIMDE_SHUFFLE_VECTOR_) + r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, v_.f32, v_.f32, 3, 2, 1, 0); #else - r.f32[0] = mem_addr[3]; - r.f32[1] = mem_addr[2]; - 
r.f32[2] = mem_addr[1]; - r.f32[3] = mem_addr[0]; + r_.f32[0] = v_.f32[3]; + r_.f32[1] = v_.f32[2]; + r_.f32[2] = v_.f32[1]; + r_.f32[3] = v_.f32[0]; #endif - return r; + return simde__m128_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_loadr_ps(mem_addr) simde_mm_loadr_ps(mem_addr) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_loadu_ps(simde_float32 const mem_addr[HEDLEY_ARRAY_PARAM(4)]) { - simde__m128 r; +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_loadu_ps(mem_addr); +#else + simde__m128_private r_; -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_loadu_ps(mem_addr); -#elif defined(SIMDE_SSE_NEON) - r.neon_f32 = vld1q_f32(mem_addr); +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_f32 = + vld1q_f32(HEDLEY_REINTERPRET_CAST(const float32_t *, mem_addr)); #else - r.f32[0] = mem_addr[0]; - r.f32[1] = mem_addr[1]; - r.f32[2] = mem_addr[2]; - r.f32[3] = mem_addr[3]; + r_.f32[0] = mem_addr[0]; + r_.f32[1] = mem_addr[1]; + r_.f32[2] = mem_addr[2]; + r_.f32[3] = mem_addr[3]; #endif - return r; + return simde__m128_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_loadu_ps(mem_addr) simde_mm_loadu_ps(mem_addr) +#endif -SIMDE__FUNCTION_ATTRIBUTES -void simde_mm_maskmove_si64(simde__m64 a, simde__m64 mask, char *mem_addr) +SIMDE_FUNCTION_ATTRIBUTES +void simde_mm_maskmove_si64(simde__m64 a, simde__m64 mask, int8_t *mem_addr) { -#if defined(SIMDE_SSE_NATIVE) - _mm_maskmove_si64(a.n, mask.n, mem_addr); +#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) + _mm_maskmove_si64(a, mask, HEDLEY_REINTERPRET_CAST(char *, mem_addr)); #else - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(a.i8) / sizeof(a.i8[0])); i++) - if (mask.i8[i] < 0) - mem_addr[i] = a.i8[i]; + simde__m64_private a_ = simde__m64_to_private(a), + mask_ = simde__m64_to_private(mask); + + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(a_.i8) / sizeof(a_.i8[0])); i++) + if (mask_.i8[i] < 0) + mem_addr[i] = a_.i8[i]; #endif } #define simde_m_maskmovq(a, mask, mem_addr) \ simde_mm_maskmove_si64(a, mask, mem_addr) +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_maskmove_si64(a, mask, mem_addr) \ + simde_mm_maskmove_si64( \ + (a), (mask), \ + SIMDE_CHECKED_REINTERPRET_CAST(int8_t *, char *, (mem_addr))) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_max_pi16(simde__m64 a, simde__m64 b) { - simde__m64 r; +#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) + return _mm_max_pi16(a, b); +#else + simde__m64_private r_, a_ = simde__m64_to_private(a), + b_ = simde__m64_to_private(b); -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_max_pi16(a.n, b.n); +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i16 = vmax_s16(a_.neon_i16, b_.neon_i16); #else - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) { - r.i16[i] = (a.i16[i] > b.i16[i]) ? a.i16[i] : b.i16[i]; + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) { + r_.i16[i] = (a_.i16[i] > b_.i16[i]) ? 
a_.i16[i] : b_.i16[i]; } #endif - return r; + return simde__m64_from_private(r_); +#endif } #define simde_m_pmaxsw(a, b) simde_mm_max_pi16(a, b) +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_max_pi16(a, b) simde_mm_max_pi16(a, b) +#define _m_pmaxsw(a, b) simde_mm_max_pi16(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_max_ps(simde__m128 a, simde__m128 b) { - simde__m128 r; +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_max_ps(a, b); +#else + simde__m128_private r_, a_ = simde__m128_to_private(a), + b_ = simde__m128_to_private(b); -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_max_ps(a.n, b.n); -#elif defined(SIMDE_SSE_NEON) - r.neon_f32 = vmaxq_f32(a.neon_f32, b.neon_f32); +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_f32 = vmaxq_f32(a_.neon_f32, b_.neon_f32); +#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) + r_.altivec_f32 = vec_max(a_.altivec_f32, b_.altivec_f32); #else - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) { - r.f32[i] = (a.f32[i] > b.f32[i]) ? a.f32[i] : b.f32[i]; + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) { + r_.f32[i] = (a_.f32[i] > b_.f32[i]) ? a_.f32[i] : b_.f32[i]; } #endif - return r; + return simde__m128_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_max_ps(a, b) simde_mm_max_ps((a), (b)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_max_pu8(simde__m64 a, simde__m64 b) { - simde__m64 r; +#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) + return _mm_max_pu8(a, b); +#else + simde__m64_private r_, a_ = simde__m64_to_private(a), + b_ = simde__m64_to_private(b); -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_max_pu8(a.n, b.n); +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_u8 = vmax_u8(a_.neon_u8, b_.neon_u8); #else - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.u8) / sizeof(r.u8[0])); i++) { - r.u8[i] = (a.u8[i] > b.u8[i]) ? a.u8[i] : b.u8[i]; + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.u8) / sizeof(r_.u8[0])); i++) { + r_.u8[i] = (a_.u8[i] > b_.u8[i]) ? a_.u8[i] : b_.u8[i]; } #endif - return r; + return simde__m64_from_private(r_); +#endif } #define simde_m_pmaxub(a, b) simde_mm_max_pu8(a, b) +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_max_pu8(a, b) simde_mm_max_pu8(a, b) +#define _m_pmaxub(a, b) simde_mm_max_pu8(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_max_ss(simde__m128 a, simde__m128 b) { - simde__m128 r; - -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_max_ss(a.n, b.n); -#elif defined(SIMDE_SSE_NEON) - float32_t value = vgetq_lane_f32(vmaxq_f32(a.neon_f32, b.neon_f32), 0); - r.neon_f32 = vsetq_lane_f32(value, a.neon_f32, 0); +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_max_ss(a, b); +#elif defined(SIMDE_ASSUME_VECTORIZATION) + return simde_mm_move_ss(a, simde_mm_max_ps(a, b)); #else - r.f32[0] = (a.f32[0] > b.f32[0]) ? a.f32[0] : b.f32[0]; - r.f32[1] = a.f32[1]; - r.f32[2] = a.f32[2]; - r.f32[3] = a.f32[3]; -#endif + simde__m128_private r_, a_ = simde__m128_to_private(a), + b_ = simde__m128_to_private(b); - return r; + r_.f32[0] = (a_.f32[0] > b_.f32[0]) ? 
a_.f32[0] : b_.f32[0]; + r_.f32[1] = a_.f32[1]; + r_.f32[2] = a_.f32[2]; + r_.f32[3] = a_.f32[3]; + + return simde__m128_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_max_ss(a, b) simde_mm_max_ss((a), (b)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_min_pi16(simde__m64 a, simde__m64 b) { - simde__m64 r; +#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) + return _mm_min_pi16(a, b); +#else + simde__m64_private r_, a_ = simde__m64_to_private(a), + b_ = simde__m64_to_private(b); -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_min_pi16(a.n, b.n); +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i16 = vmin_s16(a_.neon_i16, b_.neon_i16); #else - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) { - r.i16[i] = (a.i16[i] < b.i16[i]) ? a.i16[i] : b.i16[i]; + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) { + r_.i16[i] = (a_.i16[i] < b_.i16[i]) ? a_.i16[i] : b_.i16[i]; } #endif - return r; + return simde__m64_from_private(r_); +#endif } #define simde_m_pminsw(a, b) simde_mm_min_pi16(a, b) +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_min_pi16(a, b) simde_mm_min_pi16(a, b) +#define _m_pminsw(a, b) simde_mm_min_pi16(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_min_ps(simde__m128 a, simde__m128 b) { - simde__m128 r; +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_min_ps(a, b); +#else + simde__m128_private r_, a_ = simde__m128_to_private(a), + b_ = simde__m128_to_private(b); -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_min_ps(a.n, b.n); -#elif defined(SIMDE_SSE_NEON) - r.neon_f32 = vminq_f32(a.neon_f32, b.neon_f32); +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_f32 = vminq_f32(a_.neon_f32, b_.neon_f32); +#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) + r_.altivec_f32 = vec_min(a_.altivec_f32, b_.altivec_f32); #else - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) { - r.f32[i] = (a.f32[i] < b.f32[i]) ? a.f32[i] : b.f32[i]; + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) { + r_.f32[i] = (a_.f32[i] < b_.f32[i]) ? a_.f32[i] : b_.f32[i]; } #endif - return r; + return simde__m128_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_min_ps(a, b) simde_mm_min_ps((a), (b)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_min_pu8(simde__m64 a, simde__m64 b) { - simde__m64 r; +#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) + return _mm_min_pu8(a, b); +#else + simde__m64_private r_, a_ = simde__m64_to_private(a), + b_ = simde__m64_to_private(b); -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_min_pu8(a.n, b.n); +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_u8 = vmin_u8(a_.neon_u8, b_.neon_u8); #else - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.u8) / sizeof(r.u8[0])); i++) { - r.u8[i] = (a.u8[i] < b.u8[i]) ? a.u8[i] : b.u8[i]; + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.u8) / sizeof(r_.u8[0])); i++) { + r_.u8[i] = (a_.u8[i] < b_.u8[i]) ? 
a_.u8[i] : b_.u8[i]; } #endif - return r; + return simde__m64_from_private(r_); +#endif } #define simde_m_pminub(a, b) simde_mm_min_pu8(a, b) +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_min_pu8(a, b) simde_mm_min_pu8(a, b) +#define _m_pminub(a, b) simde_mm_min_pu8(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_min_ss(simde__m128 a, simde__m128 b) { - simde__m128 r; - -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_min_ss(a.n, b.n); -#elif defined(SIMDE_SSE_NEON) - float32_t value = vgetq_lane_f32(vminq_f32(a.neon_f32, b.neon_f32), 0); - r.neon_f32 = vsetq_lane_f32(value, a.neon_f32, 0); +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_min_ss(a, b); +#elif defined(SIMDE_ASSUME_VECTORIZATION) + return simde_mm_move_ss(a, simde_mm_min_ps(a, b)); #else - r.f32[0] = (a.f32[0] < b.f32[0]) ? a.f32[0] : b.f32[0]; - r.f32[1] = a.f32[1]; - r.f32[2] = a.f32[2]; - r.f32[3] = a.f32[3]; -#endif - - return r; -} + simde__m128_private r_, a_ = simde__m128_to_private(a), + b_ = simde__m128_to_private(b); -SIMDE__FUNCTION_ATTRIBUTES -simde__m128 simde_mm_move_ss(simde__m128 a, simde__m128 b) -{ - simde__m128 r; + r_.f32[0] = (a_.f32[0] < b_.f32[0]) ? a_.f32[0] : b_.f32[0]; + r_.f32[1] = a_.f32[1]; + r_.f32[2] = a_.f32[2]; + r_.f32[3] = a_.f32[3]; -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_move_ss(a.n, b.n); -#else - r.f32[0] = b.f32[0]; - r.f32[1] = a.f32[1]; - r.f32[2] = a.f32[2]; - r.f32[3] = a.f32[3]; + return simde__m128_from_private(r_); #endif - - return r; } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_min_ss(a, b) simde_mm_min_ss((a), (b)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_movehl_ps(simde__m128 a, simde__m128 b) { - simde__m128 r; +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_movehl_ps(a, b); +#else + simde__m128_private r_, a_ = simde__m128_to_private(a), + b_ = simde__m128_to_private(b); -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_movehl_ps(a.n, b.n); +#if defined(SIMDE_SHUFFLE_VECTOR_) + r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 6, 7, 2, 3); #else - r.f32[0] = b.f32[2]; - r.f32[1] = b.f32[3]; - r.f32[2] = a.f32[2]; - r.f32[3] = a.f32[3]; + r_.f32[0] = b_.f32[2]; + r_.f32[1] = b_.f32[3]; + r_.f32[2] = a_.f32[2]; + r_.f32[3] = a_.f32[3]; #endif - return r; + return simde__m128_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_movehl_ps(a, b) simde_mm_movehl_ps((a), (b)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_movelh_ps(simde__m128 a, simde__m128 b) { - simde__m128 r; +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_movelh_ps(a, b); +#else + simde__m128_private r_, a_ = simde__m128_to_private(a), + b_ = simde__m128_to_private(b); -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_movelh_ps(a.n, b.n); +#if defined(SIMDE_SHUFFLE_VECTOR_) + r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 0, 1, 4, 5); #else - r.f32[0] = a.f32[0]; - r.f32[1] = a.f32[1]; - r.f32[2] = b.f32[0]; - r.f32[3] = b.f32[1]; + r_.f32[0] = a_.f32[0]; + r_.f32[1] = a_.f32[1]; + r_.f32[2] = b_.f32[0]; + r_.f32[3] = b_.f32[1]; #endif - return r; + return simde__m128_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_movelh_ps(a, b) simde_mm_movelh_ps((a), (b)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES int simde_mm_movemask_pi8(simde__m64 a) { -#if defined(SIMDE_SSE_NATIVE) - return _mm_movemask_pi8(a.n); +#if defined(SIMDE_X86_SSE_NATIVE) && 
defined(SIMDE_X86_MMX_NATIVE) + return _mm_movemask_pi8(a); #else + simde__m64_private a_ = simde__m64_to_private(a); int r = 0; - const size_t nmemb = sizeof(a.i8) / sizeof(a.i8[0]); + const size_t nmemb = sizeof(a_.i8) / sizeof(a_.i8[0]); - SIMDE__VECTORIZE_REDUCTION(| : r) + SIMDE_VECTORIZE_REDUCTION(| : r) for (size_t i = 0; i < nmemb; i++) { - r |= (a.u8[nmemb - 1 - i] >> 7) << (nmemb - 1 - i); + r |= (a_.u8[nmemb - 1 - i] >> 7) << (nmemb - 1 - i); } return r; #endif } #define simde_m_pmovmskb(a, b) simde_mm_movemask_pi8(a, b) +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_movemask_pi8(a) simde_mm_movemask_pi8(a) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES int simde_mm_movemask_ps(simde__m128 a) { -#if defined(SIMDE_SSE_NATIVE) - return _mm_movemask_ps(a.n); -#elif defined(SIMDE_SSE_NEON) +#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) + return _mm_movemask_ps(a); +#else + int r = 0; + simde__m128_private a_ = simde__m128_to_private(a); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) /* TODO: check to see if NEON version is faster than the portable version */ static const uint32x4_t movemask = {1, 2, 4, 8}; static const uint32x4_t highbit = {0x80000000, 0x80000000, 0x80000000, 0x80000000}; - uint32x4_t t0 = a.neon_u32; + uint32x4_t t0 = a_.neon_u32; uint32x4_t t1 = vtstq_u32(t0, highbit); uint32x4_t t2 = vandq_u32(t1, movemask); uint32x2_t t3 = vorr_u32(vget_low_u32(t2), vget_high_u32(t2)); - return vget_lane_u32(t3, 0) | vget_lane_u32(t3, 1); + r = vget_lane_u32(t3, 0) | vget_lane_u32(t3, 1); #else - int r = 0; - - SIMDE__VECTORIZE_REDUCTION(| : r) - for (size_t i = 0; i < sizeof(a.u32) / sizeof(a.u32[0]); i++) { - r |= (a.u32[i] >> ((sizeof(a.u32[i]) * CHAR_BIT) - 1)) << i; + SIMDE_VECTORIZE_REDUCTION(| : r) + for (size_t i = 0; i < sizeof(a_.u32) / sizeof(a_.u32[0]); i++) { + r |= (a_.u32[i] >> ((sizeof(a_.u32[i]) * CHAR_BIT) - 1)) << i; } +#endif return r; #endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_movemask_ps(a) simde_mm_movemask_ps((a)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_mul_ps(simde__m128 a, simde__m128 b) { - simde__m128 r; +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_mul_ps(a, b); +#else + simde__m128_private r_, a_ = simde__m128_to_private(a), + b_ = simde__m128_to_private(b); -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_mul_ps(a.n, b.n); -#elif defined(SIMDE_SSE_NEON) - r.neon_f32 = vmulq_f32(a.neon_f32, b.neon_f32); +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_f32 = vmulq_f32(a_.neon_f32, b_.neon_f32); +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f32x4_mul(a_.wasm_v128, b_.wasm_v128); +#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.f32 = a_.f32 * b_.f32; #else - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) { - r.f32[i] = a.f32[i] * b.f32[i]; + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) { + r_.f32[i] = a_.f32[i] * b_.f32[i]; } #endif - return r; + return simde__m128_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_mul_ps(a, b) simde_mm_mul_ps((a), (b)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_mul_ss(simde__m128 a, simde__m128 b) { - simde__m128 r; - -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_mul_ss(a.n, b.n); +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_mul_ss(a, b); +#elif defined(SIMDE_ASSUME_VECTORIZATION) + return simde_mm_move_ss(a, simde_mm_mul_ps(a, 
b)); #else - r.f32[0] = a.f32[0] * b.f32[0]; - r.f32[1] = a.f32[1]; - r.f32[2] = a.f32[2]; - r.f32[3] = a.f32[3]; -#endif + simde__m128_private r_, a_ = simde__m128_to_private(a), + b_ = simde__m128_to_private(b); - return r; + r_.f32[0] = a_.f32[0] * b_.f32[0]; + r_.f32[1] = a_.f32[1]; + r_.f32[2] = a_.f32[2]; + r_.f32[3] = a_.f32[3]; + + return simde__m128_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_mul_ss(a, b) simde_mm_mul_ss((a), (b)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_mulhi_pu16(simde__m64 a, simde__m64 b) { - simde__m64 r; - -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_mulhi_pu16(a.n, b.n); +#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) + return _mm_mulhi_pu16(a, b); #else - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.u16) / sizeof(r.u16[0])); i++) { - r.u16[i] = (a.u16[i] * b.u16[i]) >> 16; + simde__m64_private r_, a_ = simde__m64_to_private(a), + b_ = simde__m64_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.u16) / sizeof(r_.u16[0])); i++) { + r_.u16[i] = HEDLEY_STATIC_CAST( + uint16_t, ((HEDLEY_STATIC_CAST(uint32_t, a_.u16[i]) * + HEDLEY_STATIC_CAST(uint32_t, b_.u16[i])) >> + UINT32_C(16))); } -#endif - return r; + return simde__m64_from_private(r_); +#endif } #define simde_m_pmulhuw(a, b) simde_mm_mulhi_pu16(a, b) +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_mulhi_pu16(a, b) simde_mm_mulhi_pu16(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_or_ps(simde__m128 a, simde__m128 b) { - simde__m128 r; +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_or_ps(a, b); +#else + simde__m128_private r_, a_ = simde__m128_to_private(a), + b_ = simde__m128_to_private(b); -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_or_ps(a.n, b.n); -#elif defined(SIMDE_SSE_NEON) - r.neon_i32 = vorrq_s32(a.neon_i32, b.neon_i32); +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i32 = vorrq_s32(a_.neon_i32, b_.neon_i32); +#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) + r_.altivec_i32 = vec_or(a_.altivec_i32, b_.altivec_i32); +#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i32f = a_.i32f | b_.i32f; #else - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.u32) / sizeof(r.u32[0])); i++) { - r.u32[i] = a.u32[i] | b.u32[i]; + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.u32) / sizeof(r_.u32[0])); i++) { + r_.u32[i] = a_.u32[i] | b_.u32[i]; } #endif - return r; + return simde__m128_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_or_ps(a, b) simde_mm_or_ps((a), (b)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES void simde_mm_prefetch(char const *p, int i) { (void)p; (void)i; } -#if defined(SIMDE_SSE_NATIVE) +#if defined(SIMDE_X86_SSE_NATIVE) #define simde_mm_prefetch(p, i) _mm_prefetch(p, i) #endif +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_prefetch(p, i) simde_mm_prefetch(p, i) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_rcp_ps(simde__m128 a) { - simde__m128 r; - -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_rcp_ps(a.n); -#elif defined(SIMDE_SSE_NEON) - float32x4_t recip = vrecpeq_f32(a.neon_f32); +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_rcp_ps(a); +#else + simde__m128_private r_, a_ = simde__m128_to_private(a); -#if !defined(SIMDE_MM_RCP_PS_ITERS) -#define SIMDE_MM_RCP_PS_ITERS SIMDE_ACCURACY_ITERS -#endif +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + float32x4_t recip = 
vrecpeq_f32(a_.neon_f32); - for (int i = 0; i < SIMDE_MM_RCP_PS_ITERS; ++i) { - recip = vmulq_f32(recip, vrecpsq_f32(recip, a.neon_f32)); +#if SIMDE_ACCURACY_PREFERENCE > 0 + for (int i = 0; i < SIMDE_ACCURACY_PREFERENCE; ++i) { + recip = vmulq_f32(recip, vrecpsq_f32(recip, a_.neon_f32)); } +#endif - r.neon_f32 = recip; + r_.neon_f32 = recip; +#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) + r_.altivec_f32 = vec_re(a_.altivec_f32); +#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + r_.f32 = 1.0f / a_.f32; #else - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) { - r.f32[i] = 1.0f / a.f32[i]; + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) { + r_.f32[i] = 1.0f / a_.f32[i]; } #endif - return r; + return simde__m128_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_rcp_ps(a) simde_mm_rcp_ps((a)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_rcp_ss(simde__m128 a) { - simde__m128 r; - -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_rcp_ss(a.n); +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_rcp_ss(a); +#elif defined(SIMDE_ASSUME_VECTORIZATION) + return simde_mm_move_ss(a, simde_mm_rcp_ps(a)); #else - r.f32[0] = 1.0f / a.f32[0]; - r.f32[1] = a.f32[1]; - r.f32[2] = a.f32[2]; - r.f32[3] = a.f32[3]; -#endif + simde__m128_private r_, a_ = simde__m128_to_private(a); - return r; + r_.f32[0] = 1.0f / a_.f32[0]; + r_.f32[1] = a_.f32[1]; + r_.f32[2] = a_.f32[2]; + r_.f32[3] = a_.f32[3]; + + return simde__m128_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_rcp_ss(a) simde_mm_rcp_ss((a)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_rsqrt_ps(simde__m128 a) { - simde__m128 r; +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_rsqrt_ps(a); +#else + simde__m128_private r_, a_ = simde__m128_to_private(a); -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_rsqrt_ps(a.n); -#elif defined(SIMDE_SSE_NEON) - r.neon_f32 = vrsqrteq_f32(a.neon_f32); +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_f32 = vrsqrteq_f32(a_.neon_f32); #elif defined(__STDC_IEC_559__) - /* http://h14s.p5r.org/2012/09/0x5f3759df.html?mwh=1 */ - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) { - r.i32[i] = INT32_C(0x5f3759df) - (a.i32[i] >> 1); + /* https://basesandframes.files.wordpress.com/2020/04/even_faster_math_functions_green_2020.pdf + Pages 100 - 103 */ + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) { +#if SIMDE_ACCURACY_PREFERENCE <= 0 + r_.i32[i] = INT32_C(0x5F37624F) - (a_.i32[i] >> 1); +#else + simde_float32 x = a_.f32[i]; + simde_float32 xhalf = SIMDE_FLOAT32_C(0.5) * x; + int32_t ix; -#if SIMDE_ACCURACY_ITERS > 2 - const float half = SIMDE_FLOAT32_C(0.5) * a.f32[i]; - for (int ai = 2; ai < SIMDE_ACCURACY_ITERS; ai++) - r.f32[i] *= SIMDE_FLOAT32_C(1.5) - - (half * r.f32[i] * r.f32[i]); -#endif - } + simde_memcpy(&ix, &x, sizeof(ix)); + +#if SIMDE_ACCURACY_PREFERENCE == 1 + ix = INT32_C(0x5F375A82) - (ix >> 1); #else - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) { - r.f32[i] = 1.0f / sqrtf(a.f32[i]); - } + ix = INT32_C(0x5F37599E) - (ix >> 1); #endif - return r; -} - -SIMDE__FUNCTION_ATTRIBUTES -simde__m128 simde_mm_rsqrt_ss(simde__m128 a) -{ - simde__m128 r; + simde_memcpy(&x, &ix, sizeof(x)); -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_rsqrt_ss(a.n); -#elif defined(__STDC_IEC_559__) - { - r.i32[0] = 
INT32_C(0x5f3759df) - (a.i32[0] >> 1); +#if SIMDE_ACCURACY_PREFERENCE >= 2 + x = x * (SIMDE_FLOAT32_C(1.5008909) - xhalf * x * x); +#endif + x = x * (SIMDE_FLOAT32_C(1.5008909) - xhalf * x * x); -#if SIMDE_ACCURACY_ITERS > 2 - float half = SIMDE_FLOAT32_C(0.5) * a.f32[0]; - for (int ai = 2; ai < SIMDE_ACCURACY_ITERS; ai++) - r.f32[0] *= SIMDE_FLOAT32_C(1.5) - - (half * r.f32[0] * r.f32[0]); + r_.f32[i] = x; #endif } - r.f32[0] = 1.0f / sqrtf(a.f32[0]); - r.f32[1] = a.f32[1]; - r.f32[2] = a.f32[2]; - r.f32[3] = a.f32[3]; +#elif defined(simde_math_sqrtf) + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) { + r_.f32[i] = 1.0f / simde_math_sqrtf(a_.f32[i]); + } #else - r.f32[0] = 1.0f / sqrtf(a.f32[0]); - r.f32[1] = a.f32[1]; - r.f32[2] = a.f32[2]; - r.f32[3] = a.f32[3]; + HEDLEY_UNREACHABLE(); #endif - return r; + return simde__m128_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_rsqrt_ps(a) simde_mm_rsqrt_ps((a)) +#endif -SIMDE__FUNCTION_ATTRIBUTES -simde__m64 simde_mm_sad_pu8(simde__m64 a, simde__m64 b) +SIMDE_FUNCTION_ATTRIBUTES +simde__m128 simde_mm_rsqrt_ss(simde__m128 a) { - simde__m64 r; +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_rsqrt_ss(a); +#elif defined(SIMDE_ASSUME_VECTORIZATION) + return simde_mm_move_ss(a, simde_mm_rsqrt_ps(a)); +#else + simde__m128_private r_, a_ = simde__m128_to_private(a); -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_sad_pu8(a.n, b.n); +#if defined(__STDC_IEC_559__) + { +#if SIMDE_ACCURACY_PREFERENCE <= 0 + r_.i32[0] = INT32_C(0x5F37624F) - (a_.i32[0] >> 1); #else - uint16_t sum = 0; + simde_float32 x = a_.f32[0]; + simde_float32 xhalf = SIMDE_FLOAT32_C(0.5) * x; + int32_t ix; - SIMDE__VECTORIZE_REDUCTION(+ : sum) - for (size_t i = 0; i < (sizeof(r.u8) / sizeof(r.u8[0])); i++) { - sum += (uint8_t)abs(a.u8[i] - b.u8[i]); - } + simde_memcpy(&ix, &x, sizeof(ix)); - r.i16[0] = sum; - r.i16[1] = 0; - r.i16[2] = 0; - r.i16[3] = 0; +#if SIMDE_ACCURACY_PREFERENCE == 1 + ix = INT32_C(0x5F375A82) - (ix >> 1); +#else + ix = INT32_C(0x5F37599E) - (ix >> 1); #endif - return r; -} -#define simde_m_psadbw(a, b) simde_mm_sad_pu8(a, b) + simde_memcpy(&x, &ix, sizeof(x)); -SIMDE__FUNCTION_ATTRIBUTES -simde__m128 simde_mm_set_ps(simde_float32 e3, simde_float32 e2, - simde_float32 e1, simde_float32 e0) -{ - simde__m128 r; +#if SIMDE_ACCURACY_PREFERENCE >= 2 + x = x * (SIMDE_FLOAT32_C(1.5008909) - xhalf * x * x); +#endif + x = x * (SIMDE_FLOAT32_C(1.5008909) - xhalf * x * x); -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_set_ps(e3, e2, e1, e0); -#elif defined(SIMDE_SSE_NEON) - SIMDE_ALIGN(16) simde_float32 data[4] = {e0, e1, e2, e3}; - r.neon_f32 = vld1q_f32(data); + r_.f32[0] = x; +#endif + } + r_.f32[1] = a_.f32[1]; + r_.f32[2] = a_.f32[2]; + r_.f32[3] = a_.f32[3]; +#elif defined(simde_math_sqrtf) + r_.f32[0] = 1.0f / simde_math_sqrtf(a_.f32[0]); + r_.f32[1] = a_.f32[1]; + r_.f32[2] = a_.f32[2]; + r_.f32[3] = a_.f32[3]; #else - r.f32[0] = e0; - r.f32[1] = e1; - r.f32[2] = e2; - r.f32[3] = e3; + HEDLEY_UNREACHABLE(); #endif - return r; + return simde__m128_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_rsqrt_ss(a) simde_mm_rsqrt_ss((a)) +#endif -SIMDE__FUNCTION_ATTRIBUTES -simde__m128 simde_mm_set_ps1(simde_float32 a) +SIMDE_FUNCTION_ATTRIBUTES +simde__m64 simde_mm_sad_pu8(simde__m64 a, simde__m64 b) { - simde__m128 r; +#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) + return _mm_sad_pu8(a, b); +#else + simde__m64_private r_, a_ = 
simde__m64_to_private(a), + b_ = simde__m64_to_private(b); + uint16_t sum = 0; + +#if defined(SIMDE_HAVE_STDLIB_H) + SIMDE_VECTORIZE_REDUCTION(+ : sum) + for (size_t i = 0; i < (sizeof(r_.u8) / sizeof(r_.u8[0])); i++) { + sum += HEDLEY_STATIC_CAST(uint8_t, abs(a_.u8[i] - b_.u8[i])); + } -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_set1_ps(a); -#elif defined(SIMDE_SSE_NEON) - r.neon_f32 = vdupq_n_f32(a); + r_.i16[0] = HEDLEY_STATIC_CAST(int16_t, sum); + r_.i16[1] = 0; + r_.i16[2] = 0; + r_.i16[3] = 0; #else - r = simde_mm_set_ps(a, a, a, a); + HEDLEY_UNREACHABLE(); #endif - return r; + return simde__m64_from_private(r_); +#endif } -#define simde_mm_set1_ps(a) simde_mm_set_ps1(a) +#define simde_m_psadbw(a, b) simde_mm_sad_pu8(a, b) +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_sad_pu8(a, b) simde_mm_sad_pu8(a, b) +#define _m_psadbw(a, b) simde_mm_sad_pu8(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_set_ss(simde_float32 a) { - simde__m128 r; - -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_set_ss(a); +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_set_ss(a); +#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vsetq_lane_f32(a, vdupq_n_f32(SIMDE_FLOAT32_C(0.0)), 0); #else - r = simde_mm_set_ps(0, 0, 0, a); + return simde_mm_set_ps(SIMDE_FLOAT32_C(0.0), SIMDE_FLOAT32_C(0.0), + SIMDE_FLOAT32_C(0.0), a); #endif - - return r; } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_set_ss(a) simde_mm_set_ss(a) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_setr_ps(simde_float32 e3, simde_float32 e2, simde_float32 e1, simde_float32 e0) { - simde__m128 r; - -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_setr_ps(e3, e2, e1, e0); -#elif defined(SIMDE_SSE_NEON) +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_setr_ps(e3, e2, e1, e0); +#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) SIMDE_ALIGN(16) simde_float32 data[4] = {e3, e2, e1, e0}; - r.neon_f32 = vld1q_f32(data); + return vld1q_f32(data); #else - r = simde_mm_set_ps(e0, e1, e2, e3); + return simde_mm_set_ps(e0, e1, e2, e3); #endif - - return r; } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_setr_ps(e3, e2, e1, e0) simde_mm_setr_ps(e3, e2, e1, e0) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_setzero_ps(void) { +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_setzero_ps(); +#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vdupq_n_f32(SIMDE_FLOAT32_C(0.0)); +#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) + return vec_splats((float)0); +#else simde__m128 r; + simde_memset(&r, 0, sizeof(r)); + return r; +#endif +} +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_setzero_ps() simde_mm_setzero_ps() +#endif -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_setzero_ps(); -#elif defined(SIMDE_SSE_NEON) - r.neon_f32 = vdupq_n_f32(0.0f); -#else - r = simde_mm_set_ps(0.0f, 0.0f, 0.0f, 0.0f); +#if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ #endif - return r; +SIMDE_FUNCTION_ATTRIBUTES +simde__m128 simde_mm_undefined_ps(void) +{ + simde__m128_private r_; + +#if defined(SIMDE_HAVE_UNDEFINED128) + r_.n = _mm_undefined_ps(); +#elif !defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) + r_ = simde__m128_to_private(simde_mm_setzero_ps()); +#endif + + return simde__m128_from_private(r_); +} +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_undefined_ps() simde_mm_undefined_ps() +#endif + +#if 
defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) +HEDLEY_DIAGNOSTIC_POP +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128 simde_x_mm_setone_ps(void) +{ + simde__m128 t = simde_mm_setzero_ps(); + return simde_mm_cmpeq_ps(t, t); } -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES void simde_mm_sfence(void) { /* TODO: Use Hedley. */ -#if defined(SIMDE_SSE_NATIVE) +#if defined(SIMDE_X86_SSE_NATIVE) _mm_sfence(); #elif defined(__GNUC__) && \ ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7)) @@ -2062,516 +2821,783 @@ void simde_mm_sfence(void) #endif #elif defined(_MSC_VER) MemoryBarrier(); -#elif defined(__GNUC__) && \ - ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7)) - __atomic_thread_fence(__ATOMIC_SEQ_CST); -#elif HEDLEY_CLANG_HAS_FEATURE(c_atomic) - __c11_atomic_thread_fence(__ATOMIC_SEQ_CST) +#elif HEDLEY_HAS_EXTENSION(c_atomic) + __c11_atomic_thread_fence(__ATOMIC_SEQ_CST); #elif defined(__GNUC__) && \ ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 1)) __sync_synchronize(); -#elif (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x5140)) || \ - (defined(__SUNPRO_CC) && (__SUNPRO_CC >= 0x5140)) - __atomic_thread_fence(__ATOMIC_SEQ_CST); #elif defined(_OPENMP) #pragma omp critical(simde_mm_sfence_) { } #endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_sfence() simde_mm_sfence() +#endif #define SIMDE_MM_SHUFFLE(z, y, x, w) \ (((z) << 6) | ((y) << 4) | ((x) << 2) | (w)) - -SIMDE__FUNCTION_ATTRIBUTES +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _MM_SHUFFLE(z, y, x, w) SIMDE_MM_SHUFFLE(z, y, x, w) +#endif + +#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && \ + !defined(__PGI) +#define simde_mm_shuffle_pi16(a, imm8) _mm_shuffle_pi16(a, imm8) +#elif defined(SIMDE_SHUFFLE_VECTOR_) +#define simde_mm_shuffle_pi16(a, imm8) \ + (__extension__({ \ + const simde__m64_private simde__tmp_a_ = \ + simde__m64_to_private(a); \ + simde__m64_from_private((simde__m64_private){ \ + .i16 = SIMDE_SHUFFLE_VECTOR_( \ + 16, 8, (simde__tmp_a_).i16, \ + (simde__tmp_a_).i16, (((imm8)) & 3), \ + (((imm8) >> 2) & 3), (((imm8) >> 4) & 3), \ + (((imm8) >> 6) & 3))}); \ + })) +#else +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_shuffle_pi16(simde__m64 a, const int imm8) + SIMDE_REQUIRE_RANGE(imm8, 0, 255) { - simde__m64 r; - for (size_t i = 0; i < sizeof(r.u16) / sizeof(r.u16[0]); i++) { - r.i16[i] = a.i16[(imm8 >> (i * 2)) & 3]; + simde__m64_private r_; + simde__m64_private a_ = simde__m64_to_private(a); + + for (size_t i = 0; i < sizeof(r_.i16) / sizeof(r_.i16[0]); i++) { + r_.i16[i] = a_.i16[(imm8 >> (i * 2)) & 3]; } - return r; + + HEDLEY_DIAGNOSTIC_PUSH +#if HEDLEY_HAS_WARNING("-Wconditional-uninitialized") +#pragma clang diagnostic ignored "-Wconditional-uninitialized" +#endif + return simde__m64_from_private(r_); + HEDLEY_DIAGNOSTIC_POP } -#if defined(SIMDE_SSE_NATIVE) && !defined(__PGI) -#define simde_mm_shuffle_pi16(a, imm8) SIMDE__M64_C(_mm_shuffle_pi16(a.n, imm8)) -#elif defined(SIMDE__SHUFFLE_VECTOR) -#define simde_mm_shuffle_pi16(a, imm8) \ - ({ \ - const simde__m64 simde__tmp_a_ = a; \ - (simde__m64){.i16 = SIMDE__SHUFFLE_VECTOR( \ - 16, 8, (simde__tmp_a_).i16, \ - (simde__tmp_a_).i16, (((imm8)) & 3), \ - (((imm8) >> 2) & 3), (((imm8) >> 4) & 3), \ - (((imm8) >> 6) & 3))}; \ - }) #endif - -#if defined(SIMDE_SSE_NATIVE) && !defined(__PGI) -#define simde_m_pshufw(a, imm8) SIMDE__M64_C(_m_pshufw(a.n, imm8)) +#if defined(SIMDE_X86_SSE_NATIVE) && !defined(__PGI) +#define simde_m_pshufw(a, imm8) _m_pshufw(a, imm8) #else #define 
simde_m_pshufw(a, imm8) simde_mm_shuffle_pi16(a, imm8) #endif - -SIMDE__FUNCTION_ATTRIBUTES +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_shuffle_pi16(a, imm8) simde_mm_shuffle_pi16(a, imm8) +#define _m_pshufw(a, imm8) simde_mm_shuffle_pi16(a, imm8) +#endif + +#if defined(SIMDE_X86_SSE_NATIVE) && !defined(__PGI) +#define simde_mm_shuffle_ps(a, b, imm8) _mm_shuffle_ps(a, b, imm8) +#elif defined(SIMDE_SHUFFLE_VECTOR_) +#define simde_mm_shuffle_ps(a, b, imm8) \ + (__extension__({ \ + simde__m128_from_private((simde__m128_private){ \ + .f32 = SIMDE_SHUFFLE_VECTOR_( \ + 32, 16, simde__m128_to_private(a).f32, \ + simde__m128_to_private(b).f32, (((imm8)) & 3), \ + (((imm8) >> 2) & 3), (((imm8) >> 4) & 3) + 4, \ + (((imm8) >> 6) & 3) + 4)}); \ + })) +#else +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_shuffle_ps(simde__m128 a, simde__m128 b, const int imm8) + SIMDE_REQUIRE_RANGE(imm8, 0, 255) { - simde__m128 r; - r.f32[0] = a.f32[(imm8 >> 0) & 3]; - r.f32[1] = a.f32[(imm8 >> 2) & 3]; - r.f32[2] = b.f32[(imm8 >> 4) & 3]; - r.f32[3] = b.f32[(imm8 >> 6) & 3]; - return r; + simde__m128_private r_, a_ = simde__m128_to_private(a), + b_ = simde__m128_to_private(b); + + r_.f32[0] = a_.f32[(imm8 >> 0) & 3]; + r_.f32[1] = a_.f32[(imm8 >> 2) & 3]; + r_.f32[2] = b_.f32[(imm8 >> 4) & 3]; + r_.f32[3] = b_.f32[(imm8 >> 6) & 3]; + + return simde__m128_from_private(r_); } -#if defined(SIMDE_SSE_NATIVE) && !defined(__PGI) -#define simde_mm_shuffle_ps(a, b, imm8) \ - SIMDE__M128_C(_mm_shuffle_ps(a.n, b.n, imm8)) -#elif defined(SIMDE__SHUFFLE_VECTOR) -#define simde_mm_shuffle_ps(a, b, imm8) \ - ({ \ - (simde__m128){.f32 = SIMDE__SHUFFLE_VECTOR( \ - 32, 16, (a).f32, (b).f32, \ - (((imm8)) & 3), (((imm8) >> 2) & 3), \ - (((imm8) >> 4) & 3) + 4, \ - (((imm8) >> 6) & 3) + 4)}; \ - }) -#endif - -SIMDE__FUNCTION_ATTRIBUTES +#endif +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_shuffle_ps(a, b, imm8) simde_mm_shuffle_ps((a), (b), imm8) +#endif + +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_sqrt_ps(simde__m128 a) { - simde__m128 r; +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_sqrt_ps(a); +#else + simde__m128_private r_, a_ = simde__m128_to_private(a); -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_sqrt_ps(a.n); -#elif defined(SIMDE_SSE_NEON) - float32x4_t recipsq = vrsqrteq_f32(a.neon_f32); +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + float32x4_t recipsq = vrsqrteq_f32(a_.neon_f32); float32x4_t sq = vrecpeq_f32(recipsq); /* ??? use step versions of both sqrt and recip for better accuracy? 
*/ - r.neon_f32 = sq; -#else - SIMDE__VECTORIZE - for (size_t i = 0; i < sizeof(r.f32) / sizeof(r.f32[0]); i++) { - r.f32[i] = sqrtf(a.f32[i]); + r_.neon_f32 = sq; +#elif defined(simde_math_sqrt) + SIMDE_VECTORIZE + for (size_t i = 0; i < sizeof(r_.f32) / sizeof(r_.f32[0]); i++) { + r_.f32[i] = simde_math_sqrtf(a_.f32[i]); } +#else + HEDLEY_UNREACHABLE(); #endif - return r; + return simde__m128_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_sqrt_ps(a) simde_mm_sqrt_ps((a)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_sqrt_ss(simde__m128 a) { - simde__m128 r; +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_sqrt_ss(a); +#elif defined(SIMDE_ASSUME_VECTORIZATION) + return simde_mm_move_ss(a, simde_mm_sqrt_ps(a)); +#else + simde__m128_private r_, a_ = simde__m128_to_private(a); -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_sqrt_ss(a.n); -#elif defined(SIMDE_SSE_NEON) - float32_t value = vgetq_lane_f32(simde_mm_sqrt_ps(a).neon_f32, 0); - r.neon_f32 = vsetq_lane_f32(value, a.neon_f32, 0); +#if defined(simde_math_sqrtf) + r_.f32[0] = simde_math_sqrtf(a_.f32[0]); + r_.f32[1] = a_.f32[1]; + r_.f32[2] = a_.f32[2]; + r_.f32[3] = a_.f32[3]; #else - r.f32[0] = sqrtf(a.f32[0]); - r.f32[1] = a.f32[1]; - r.f32[2] = a.f32[2]; - r.f32[3] = a.f32[3]; + HEDLEY_UNREACHABLE(); #endif - return r; + return simde__m128_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_sqrt_ss(a) simde_mm_sqrt_ss((a)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES void simde_mm_store_ps(simde_float32 mem_addr[4], simde__m128 a) { simde_assert_aligned(16, mem_addr); -#if defined(SIMDE_SSE_NATIVE) - _mm_store_ps(mem_addr, a.n); -#elif defined(SIMDE_SSE_NEON) - vst1q_f32(mem_addr, a.neon_f32); +#if defined(SIMDE_X86_SSE_NATIVE) + _mm_store_ps(mem_addr, a); +#else + simde__m128_private a_ = simde__m128_to_private(a); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + vst1q_f32(mem_addr, a_.neon_f32); +#elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) + vec_vsx_st(a_.altivec_f32, 0, mem_addr); +#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) + vec_st(a_.altivec_f32, 0, mem_addr); +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + wasm_v128_store(mem_addr, a_.wasm_v128); #else - SIMDE__VECTORIZE_ALIGNED(mem_addr : 16) - for (size_t i = 0; i < sizeof(a.f32) / sizeof(a.f32[0]); i++) { - mem_addr[i] = a.f32[i]; + SIMDE_VECTORIZE_ALIGNED(mem_addr : 16) + for (size_t i = 0; i < sizeof(a_.f32) / sizeof(a_.f32[0]); i++) { + mem_addr[i] = a_.f32[i]; } #endif +#endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_store_ps(mem_addr, a) \ + simde_mm_store_ps(SIMDE_CHECKED_REINTERPRET_CAST( \ + float *, simde_float32 *, mem_addr), \ + (a)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES void simde_mm_store_ps1(simde_float32 mem_addr[4], simde__m128 a) { simde_assert_aligned(16, mem_addr); -#if defined(SIMDE_SSE_NATIVE) - _mm_store_ps1(mem_addr, a.n); +#if defined(SIMDE_X86_SSE_NATIVE) + _mm_store_ps1(mem_addr, a); #else - SIMDE__VECTORIZE_ALIGNED(mem_addr : 16) - for (size_t i = 0; i < sizeof(a.f32) / sizeof(a.f32[0]); i++) { - mem_addr[i] = a.f32[0]; + simde__m128_private a_ = simde__m128_to_private(a); + + SIMDE_VECTORIZE_ALIGNED(mem_addr : 16) + for (size_t i = 0; i < sizeof(a_.f32) / sizeof(a_.f32[0]); i++) { + mem_addr[i] = a_.f32[0]; } #endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_store_ps1(mem_addr, a) \ + simde_mm_store_ps1(SIMDE_CHECKED_REINTERPRET_CAST( \ + float *, 
simde_float32 *, mem_addr), \ + (a)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES void simde_mm_store_ss(simde_float32 *mem_addr, simde__m128 a) { -#if defined(SIMDE_SSE_NATIVE) - _mm_store_ss(mem_addr, a.n); -#elif defined(SIMDE_SSE_NEON) - vst1q_lane_f32(mem_addr, a.neon_f32, 0); +#if defined(SIMDE_X86_SSE_NATIVE) + _mm_store_ss(mem_addr, a); +#else + simde__m128_private a_ = simde__m128_to_private(a); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + vst1q_lane_f32(mem_addr, a_.neon_f32, 0); #else - *mem_addr = a.f32[0]; + *mem_addr = a_.f32[0]; +#endif #endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_store_ss(mem_addr, a) \ + simde_mm_store_ss(SIMDE_CHECKED_REINTERPRET_CAST( \ + float *, simde_float32 *, mem_addr), \ + (a)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES void simde_mm_store1_ps(simde_float32 mem_addr[4], simde__m128 a) { simde_assert_aligned(16, mem_addr); -#if defined(SIMDE_SSE_NATIVE) - _mm_store1_ps(mem_addr, a.n); +#if defined(SIMDE_X86_SSE_NATIVE) + _mm_store1_ps(mem_addr, a); #else simde_mm_store_ps1(mem_addr, a); #endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_store1_ps(mem_addr, a) \ + simde_mm_store1_ps(SIMDE_CHECKED_REINTERPRET_CAST( \ + float *, simde_float32 *, mem_addr), \ + (a)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES void simde_mm_storeh_pi(simde__m64 *mem_addr, simde__m128 a) { -#if defined(SIMDE_SSE_NATIVE) - _mm_storeh_pi(&(mem_addr->n), a.n); +#if defined(SIMDE_X86_SSE_NATIVE) + _mm_storeh_pi(HEDLEY_REINTERPRET_CAST(__m64 *, mem_addr), a); #else - mem_addr->f32[0] = a.f32[2]; - mem_addr->f32[1] = a.f32[3]; + simde__m64_private *dest_ = + HEDLEY_REINTERPRET_CAST(simde__m64_private *, mem_addr); + simde__m128_private a_ = simde__m128_to_private(a); + + dest_->f32[0] = a_.f32[2]; + dest_->f32[1] = a_.f32[3]; #endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_storeh_pi(mem_addr, a) simde_mm_storeh_pi(mem_addr, (a)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES void simde_mm_storel_pi(simde__m64 *mem_addr, simde__m128 a) { -#if defined(SIMDE_SSE_NATIVE) - _mm_storel_pi(&(mem_addr->n), a.n); +#if defined(SIMDE_X86_SSE_NATIVE) + _mm_storel_pi(HEDLEY_REINTERPRET_CAST(__m64 *, mem_addr), a); #else - mem_addr->f32[0] = a.f32[0]; - mem_addr->f32[1] = a.f32[1]; + simde__m64_private *dest_ = + HEDLEY_REINTERPRET_CAST(simde__m64_private *, mem_addr); + simde__m128_private a_ = simde__m128_to_private(a); + + dest_->f32[0] = a_.f32[0]; + dest_->f32[1] = a_.f32[1]; #endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_storel_pi(mem_addr, a) simde_mm_storel_pi(mem_addr, (a)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES void simde_mm_storer_ps(simde_float32 mem_addr[4], simde__m128 a) { simde_assert_aligned(16, mem_addr); -#if defined(SIMDE_SSE_NATIVE) - _mm_storer_ps(mem_addr, a.n); +#if defined(SIMDE_X86_SSE_NATIVE) + _mm_storer_ps(mem_addr, a); #else - SIMDE__VECTORIZE_ALIGNED(mem_addr : 16) - for (size_t i = 0; i < sizeof(a.f32) / sizeof(a.f32[0]); i++) { + simde__m128_private a_ = simde__m128_to_private(a); + +#if defined(SIMDE_SHUFFLE_VECTOR_) + a_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, a_.f32, 3, 2, 1, 0); + simde_mm_store_ps(mem_addr, simde__m128_from_private(a_)); +#else + SIMDE_VECTORIZE_ALIGNED(mem_addr : 16) + for (size_t i = 0; i < sizeof(a_.f32) / sizeof(a_.f32[0]); i++) { mem_addr[i] = - a.f32[((sizeof(a.f32) / sizeof(a.f32[0])) - 1) - i]; + a_.f32[((sizeof(a_.f32) / 
sizeof(a_.f32[0])) - 1) - i]; } #endif +#endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_storer_ps(mem_addr, a) \ + simde_mm_storer_ps(SIMDE_CHECKED_REINTERPRET_CAST( \ + float *, simde_float32 *, mem_addr), \ + (a)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES void simde_mm_storeu_ps(simde_float32 mem_addr[4], simde__m128 a) { -#if defined(SIMDE_SSE_NATIVE) - _mm_storeu_ps(mem_addr, a.n); -#elif defined(SIMDE_SSE_NEON) - vst1q_f32(mem_addr, a.neon_f32); +#if defined(SIMDE_X86_SSE_NATIVE) + _mm_storeu_ps(mem_addr, a); #else - SIMDE__VECTORIZE - for (size_t i = 0; i < sizeof(a.f32) / sizeof(a.f32[0]); i++) { - mem_addr[i] = a.f32[i]; - } + simde__m128_private a_ = simde__m128_to_private(a); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + vst1q_f32(mem_addr, a_.neon_f32); +#else + simde_memcpy(mem_addr, &a_, sizeof(a_)); +#endif #endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_storeu_ps(mem_addr, a) \ + simde_mm_storeu_ps(SIMDE_CHECKED_REINTERPRET_CAST( \ + float *, simde_float32 *, mem_addr), \ + (a)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_sub_ps(simde__m128 a, simde__m128 b) { - simde__m128 r; +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_sub_ps(a, b); +#else + simde__m128_private r_, a_ = simde__m128_to_private(a), + b_ = simde__m128_to_private(b); -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_sub_ps(a.n, b.n); -#elif defined(SIMDE_SSE_NEON) - r.neon_f32 = vsubq_f32(a.neon_f32, b.neon_f32); +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_f32 = vsubq_f32(a_.neon_f32, b_.neon_f32); +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f32x4_sub(a_.wasm_v128, b_.wasm_v128); +#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.f32 = a_.f32 - b_.f32; #else - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) { - r.f32[i] = a.f32[i] - b.f32[i]; + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) { + r_.f32[i] = a_.f32[i] - b_.f32[i]; } #endif - return r; + return simde__m128_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_sub_ps(a, b) simde_mm_sub_ps((a), (b)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_sub_ss(simde__m128 a, simde__m128 b) { - simde__m128 r; - -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_sub_ss(a.n, b.n); +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_sub_ss(a, b); +#elif defined(SIMDE_ASSUME_VECTORIZATION) + return simde_mm_move_ss(a, simde_mm_sub_ps(a, b)); #else - r.f32[0] = a.f32[0] - b.f32[0]; - r.f32[1] = a.f32[1]; - r.f32[2] = a.f32[2]; - r.f32[3] = a.f32[3]; -#endif + simde__m128_private r_, a_ = simde__m128_to_private(a), + b_ = simde__m128_to_private(b); - return r; + r_.f32[0] = a_.f32[0] - b_.f32[0]; + r_.f32[1] = a_.f32[1]; + r_.f32[2] = a_.f32[2]; + r_.f32[3] = a_.f32[3]; + + return simde__m128_from_private(r_); +#endif } -SIMDE__FUNCTION_ATTRIBUTES +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_sub_ss(a, b) simde_mm_sub_ss((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES int simde_mm_ucomieq_ss(simde__m128 a, simde__m128 b) { -#if defined(SIMDE_SSE_NATIVE) - return _mm_ucomieq_ss(a.n, b.n); +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_ucomieq_ss(a, b); #else + simde__m128_private a_ = simde__m128_to_private(a), + b_ = simde__m128_to_private(b); + int r; + +#if defined(SIMDE_HAVE_FENV_H) fenv_t envp; int x = feholdexcept(&envp); - int r = a.f32[0] == b.f32[0]; + r = a_.f32[0] == b_.f32[0]; 
if (HEDLEY_LIKELY(x == 0)) fesetenv(&envp); +#else + r = a_.f32[0] == b_.f32[0]; +#endif + return r; #endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_ucomieq_ss(a, b) simde_mm_ucomieq_ss((a), (b)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES int simde_mm_ucomige_ss(simde__m128 a, simde__m128 b) { -#if defined(SIMDE_SSE_NATIVE) - return _mm_ucomige_ss(a.n, b.n); +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_ucomige_ss(a, b); #else + simde__m128_private a_ = simde__m128_to_private(a), + b_ = simde__m128_to_private(b); + int r; + +#if defined(SIMDE_HAVE_FENV_H) fenv_t envp; int x = feholdexcept(&envp); - int r = a.f32[0] >= b.f32[0]; + r = a_.f32[0] >= b_.f32[0]; if (HEDLEY_LIKELY(x == 0)) fesetenv(&envp); +#else + r = a_.f32[0] >= b_.f32[0]; +#endif + return r; #endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_ucomige_ss(a, b) simde_mm_ucomige_ss((a), (b)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES int simde_mm_ucomigt_ss(simde__m128 a, simde__m128 b) { -#if defined(SIMDE_SSE_NATIVE) - return _mm_ucomigt_ss(a.n, b.n); +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_ucomigt_ss(a, b); #else + simde__m128_private a_ = simde__m128_to_private(a), + b_ = simde__m128_to_private(b); + int r; + +#if defined(SIMDE_HAVE_FENV_H) fenv_t envp; int x = feholdexcept(&envp); - int r = a.f32[0] > b.f32[0]; + r = a_.f32[0] > b_.f32[0]; if (HEDLEY_LIKELY(x == 0)) fesetenv(&envp); +#else + r = a_.f32[0] > b_.f32[0]; +#endif + return r; #endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_ucomigt_ss(a, b) simde_mm_ucomigt_ss((a), (b)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES int simde_mm_ucomile_ss(simde__m128 a, simde__m128 b) { -#if defined(SIMDE_SSE_NATIVE) - return _mm_ucomile_ss(a.n, b.n); +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_ucomile_ss(a, b); #else + simde__m128_private a_ = simde__m128_to_private(a), + b_ = simde__m128_to_private(b); + int r; + +#if defined(SIMDE_HAVE_FENV_H) fenv_t envp; int x = feholdexcept(&envp); - int r = a.f32[0] <= b.f32[0]; + r = a_.f32[0] <= b_.f32[0]; if (HEDLEY_LIKELY(x == 0)) fesetenv(&envp); +#else + r = a_.f32[0] <= b_.f32[0]; +#endif + return r; #endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_ucomile_ss(a, b) simde_mm_ucomile_ss((a), (b)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES int simde_mm_ucomilt_ss(simde__m128 a, simde__m128 b) { -#if defined(SIMDE_SSE_NATIVE) - return _mm_ucomilt_ss(a.n, b.n); +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_ucomilt_ss(a, b); #else + simde__m128_private a_ = simde__m128_to_private(a), + b_ = simde__m128_to_private(b); + int r; + +#if defined(SIMDE_HAVE_FENV_H) fenv_t envp; int x = feholdexcept(&envp); - int r = a.f32[0] < b.f32[0]; + r = a_.f32[0] < b_.f32[0]; if (HEDLEY_LIKELY(x == 0)) fesetenv(&envp); +#else + r = a_.f32[0] < b_.f32[0]; +#endif + return r; #endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_ucomilt_ss(a, b) simde_mm_ucomilt_ss((a), (b)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES int simde_mm_ucomineq_ss(simde__m128 a, simde__m128 b) { -#if defined(SIMDE_SSE_NATIVE) - return _mm_ucomineq_ss(a.n, b.n); +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_ucomineq_ss(a, b); #else + simde__m128_private a_ = simde__m128_to_private(a), + b_ = simde__m128_to_private(b); + int r; + +#if defined(SIMDE_HAVE_FENV_H) fenv_t envp; int x = feholdexcept(&envp); - int r = a.f32[0] != b.f32[0]; + r = a_.f32[0] != 
b_.f32[0]; if (HEDLEY_LIKELY(x == 0)) fesetenv(&envp); +#else + r = a_.f32[0] != b_.f32[0]; +#endif + return r; #endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_ucomineq_ss(a, b) simde_mm_ucomineq_ss((a), (b)) +#endif -#if defined(SIMDE_SSE_NATIVE) +#if defined(SIMDE_X86_SSE_NATIVE) #if defined(__has_builtin) #if __has_builtin(__builtin_ia32_undef128) -#define SIMDE__HAVE_UNDEFINED128 +#define SIMDE_HAVE_UNDEFINED128 #endif -#elif !defined(__PGI) && !defined(SIMDE_BUG_GCC_REV_208793) -#define SIMDE__HAVE_UNDEFINED128 +#elif !defined(__PGI) && !defined(SIMDE_BUG_GCC_REV_208793) && \ + !defined(_MSC_VER) +#define SIMDE_HAVE_UNDEFINED128 #endif #endif -SIMDE__FUNCTION_ATTRIBUTES -simde__m128 simde_mm_undefined_ps(void) -{ - simde__m128 r; - -#if defined(SIMDE__HAVE_UNDEFINED128) - r.n = _mm_undefined_ps(); -#else - r = simde_mm_setzero_ps(); +#if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ #endif - return r; -} - -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_unpackhi_ps(simde__m128 a, simde__m128 b) { - simde__m128 r; +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_unpackhi_ps(a, b); +#else + simde__m128_private r_, a_ = simde__m128_to_private(a), + b_ = simde__m128_to_private(b); -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_unpackhi_ps(a.n, b.n); -#elif defined(SIMDE_SSE_NEON) - float32x2_t a1 = vget_high_f32(a.neon_f32); - float32x2_t b1 = vget_high_f32(b.neon_f32); +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + float32x2_t a1 = vget_high_f32(a_.neon_f32); + float32x2_t b1 = vget_high_f32(b_.neon_f32); float32x2x2_t result = vzip_f32(a1, b1); - r.neon_f32 = vcombine_f32(result.val[0], result.val[1]); + r_.neon_f32 = vcombine_f32(result.val[0], result.val[1]); +#elif defined(SIMDE_SHUFFLE_VECTOR_) + r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 2, 6, 3, 7); #else - r.f32[0] = a.f32[2]; - r.f32[1] = b.f32[2]; - r.f32[2] = a.f32[3]; - r.f32[3] = b.f32[3]; + r_.f32[0] = a_.f32[2]; + r_.f32[1] = b_.f32[2]; + r_.f32[2] = a_.f32[3]; + r_.f32[3] = b_.f32[3]; #endif - return r; + return simde__m128_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_unpackhi_ps(a, b) simde_mm_unpackhi_ps((a), (b)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_unpacklo_ps(simde__m128 a, simde__m128 b) { - simde__m128 r; +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_unpacklo_ps(a, b); +#else + simde__m128_private r_, a_ = simde__m128_to_private(a), + b_ = simde__m128_to_private(b); -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_unpacklo_ps(a.n, b.n); -#elif defined(SIMDE_SSE_NEON) - float32x2_t a1 = vget_low_f32(a.neon_f32); - float32x2_t b1 = vget_low_f32(b.neon_f32); +#if defined(SIMDE_SHUFFLE_VECTOR_) + r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 0, 4, 1, 5); +#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + float32x2_t a1 = vget_low_f32(a_.neon_f32); + float32x2_t b1 = vget_low_f32(b_.neon_f32); float32x2x2_t result = vzip_f32(a1, b1); - r.neon_f32 = vcombine_f32(result.val[0], result.val[1]); + r_.neon_f32 = vcombine_f32(result.val[0], result.val[1]); #else - r.f32[0] = a.f32[0]; - r.f32[1] = b.f32[0]; - r.f32[2] = a.f32[1]; - r.f32[3] = b.f32[1]; + r_.f32[0] = a_.f32[0]; + r_.f32[1] = b_.f32[0]; + r_.f32[2] = a_.f32[1]; + r_.f32[3] = b_.f32[1]; #endif - return r; + return simde__m128_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_unpacklo_ps(a, b) 
simde_mm_unpacklo_ps((a), (b)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_xor_ps(simde__m128 a, simde__m128 b) { - simde__m128 r; +#if defined(SIMDE_X86_SSE_NATIVE) + return _mm_xor_ps(a, b); +#else + simde__m128_private r_, a_ = simde__m128_to_private(a), + b_ = simde__m128_to_private(b); -#if defined(SIMDE_SSE_NATIVE) - r.n = _mm_xor_ps(a.n, b.n); -#elif defined(SIMDE_SSE_NEON) - r.neon_i32 = veorq_s32(a.neon_i32, b.neon_i32); +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i32 = veorq_s32(a_.neon_i32, b_.neon_i32); +#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) + r_.altivec_i32 = vec_xor(a_.altivec_i32, b_.altivec_i32); +#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i32f = a_.i32f ^ b_.i32f; #else - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.u32) / sizeof(r.u32[0])); i++) { - r.u32[i] = a.u32[i] ^ b.u32[i]; + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.u32) / sizeof(r_.u32[0])); i++) { + r_.u32[i] = a_.u32[i] ^ b_.u32[i]; } #endif - return r; + return simde__m128_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_xor_ps(a, b) simde_mm_xor_ps((a), (b)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES void simde_mm_stream_pi(simde__m64 *mem_addr, simde__m64 a) { -#if defined(SIMDE_SSE_NATIVE) - _mm_stream_pi(&(mem_addr->n), a.n); +#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) + _mm_stream_pi(HEDLEY_REINTERPRET_CAST(__m64 *, mem_addr), a); +#else + simde__m64_private *dest = HEDLEY_REINTERPRET_CAST(simde__m64_private *, + mem_addr), + a_ = simde__m64_to_private(a); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + dest->i64[0] = vget_lane_s64(a_.neon_i64, 0); #else - mem_addr->i64[0] = a.i64[0]; + dest->i64[0] = a_.i64[0]; +#endif #endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_stream_pi(mem_addr, a) simde_mm_stream_pi(mem_addr, (a)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES void simde_mm_stream_ps(simde_float32 mem_addr[4], simde__m128 a) { simde_assert_aligned(16, mem_addr); -#if defined(SIMDE_SSE_NATIVE) - _mm_stream_ps(mem_addr, a.n); +#if defined(SIMDE_X86_SSE_NATIVE) + _mm_stream_ps(mem_addr, a); +#else + simde__m128_private a_ = simde__m128_to_private(a); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + vst1q_f32(SIMDE_ASSUME_ALIGNED(16, mem_addr), a_.neon_f32); #else - SIMDE__ASSUME_ALIGNED(mem_addr, 16); - memcpy(mem_addr, &a, sizeof(a)); + simde_memcpy(SIMDE_ASSUME_ALIGNED(16, mem_addr), &a_, sizeof(a_)); +#endif #endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_stream_ps(mem_addr, a) \ + simde_mm_stream_ps(SIMDE_CHECKED_REINTERPRET_CAST( \ + float *, simde_float32 *, mem_addr), \ + (a)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES uint32_t simde_mm_getcsr(void) { -#if defined(SIMDE_SSE_NATIVE) +#if defined(SIMDE_X86_SSE_NATIVE) return _mm_getcsr(); #else uint32_t r = 0; + +#if defined(SIMDE_HAVE_FENV_H) int rounding_mode = fegetround(); switch (rounding_mode) { +#if defined(FE_TONEAREST) case FE_TONEAREST: break; +#endif +#if defined(FE_UPWARD) case FE_UPWARD: r |= 2 << 13; break; +#endif +#if defined(FE_DOWNWARD) case FE_DOWNWARD: r |= 1 << 13; break; +#endif +#if defined(FE_TOWARDZERO) case FE_TOWARDZERO: r = 3 << 13; break; +#endif } +#else + HEDLEY_UNREACHABLE(); +#endif return r; #endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_getcsr() simde_mm_getcsr() +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES void 
simde_mm_setcsr(uint32_t a) { -#if defined(SIMDE_SSE_NATIVE) +#if defined(SIMDE_X86_SSE_NATIVE) _mm_setcsr(a); #else switch ((a >> 13) & 3) { +#if defined(FE_TONEAREST) case 0: fesetround(FE_TONEAREST); +#endif +#if defined(FE_DOWNWARD) break; case 1: fesetround(FE_DOWNWARD); +#endif +#if defined(FE_UPWARD) break; case 2: fesetround(FE_UPWARD); +#endif +#if defined(FE_TOWARDZERO) break; case 3: fesetround(FE_TOWARDZERO); break; +#endif } #endif } +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _mm_setcsr(a) simde_mm_setcsr(a) +#endif #define SIMDE_MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ do { \ @@ -2586,6 +3612,107 @@ void simde_mm_setcsr(uint32_t a) row3 = simde_mm_movehl_ps(tmp3, tmp1); \ } while (0) -SIMDE__END_DECLS +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) +#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ + SIMDE_MM_TRANSPOSE4_PS(row0, row1, row2, row3) +#endif + +#if defined(_MM_EXCEPT_INVALID) +#define SIMDE_MM_EXCEPT_INVALID _MM_EXCEPT_INVALID +#else +#define SIMDE_MM_EXCEPT_INVALID (0x0001) +#endif +#if defined(_MM_EXCEPT_DENORM) +#define SIMDE_MM_EXCEPT_DENORM _MM_EXCEPT_DENORM +#else +#define SIMDE_MM_EXCEPT_DENORM (0x0002) +#endif +#if defined(_MM_EXCEPT_DIV_ZERO) +#define SIMDE_MM_EXCEPT_DIV_ZERO _MM_EXCEPT_DIV_ZERO +#else +#define SIMDE_MM_EXCEPT_DIV_ZERO (0x0004) +#endif +#if defined(_MM_EXCEPT_OVERFLOW) +#define SIMDE_MM_EXCEPT_OVERFLOW _MM_EXCEPT_OVERFLOW +#else +#define SIMDE_MM_EXCEPT_OVERFLOW (0x0008) +#endif +#if defined(_MM_EXCEPT_UNDERFLOW) +#define SIMDE_MM_EXCEPT_UNDERFLOW _MM_EXCEPT_UNDERFLOW +#else +#define SIMDE_MM_EXCEPT_UNDERFLOW (0x0010) +#endif +#if defined(_MM_EXCEPT_INEXACT) +#define SIMDE_MM_EXCEPT_INEXACT _MM_EXCEPT_INEXACT +#else +#define SIMDE_MM_EXCEPT_INEXACT (0x0020) +#endif +#if defined(_MM_EXCEPT_MASK) +#define SIMDE_MM_EXCEPT_MASK _MM_EXCEPT_MASK +#else +#define SIMDE_MM_EXCEPT_MASK \ + (SIMDE_MM_EXCEPT_INVALID | SIMDE_MM_EXCEPT_DENORM | \ + SIMDE_MM_EXCEPT_DIV_ZERO | SIMDE_MM_EXCEPT_OVERFLOW | \ + SIMDE_MM_EXCEPT_UNDERFLOW | SIMDE_MM_EXCEPT_INEXACT) +#endif + +#if defined(_MM_MASK_INVALID) +#define SIMDE_MM_MASK_INVALID _MM_MASK_INVALID +#else +#define SIMDE_MM_MASK_INVALID (0x0080) +#endif +#if defined(_MM_MASK_DENORM) +#define SIMDE_MM_MASK_DENORM _MM_MASK_DENORM +#else +#define SIMDE_MM_MASK_DENORM (0x0100) +#endif +#if defined(_MM_MASK_DIV_ZERO) +#define SIMDE_MM_MASK_DIV_ZERO _MM_MASK_DIV_ZERO +#else +#define SIMDE_MM_MASK_DIV_ZERO (0x0200) +#endif +#if defined(_MM_MASK_OVERFLOW) +#define SIMDE_MM_MASK_OVERFLOW _MM_MASK_OVERFLOW +#else +#define SIMDE_MM_MASK_OVERFLOW (0x0400) +#endif +#if defined(_MM_MASK_UNDERFLOW) +#define SIMDE_MM_MASK_UNDERFLOW _MM_MASK_UNDERFLOW +#else +#define SIMDE_MM_MASK_UNDERFLOW (0x0800) +#endif +#if defined(_MM_MASK_INEXACT) +#define SIMDE_MM_MASK_INEXACT _MM_MASK_INEXACT +#else +#define SIMDE_MM_MASK_INEXACT (0x1000) +#endif +#if defined(_MM_MASK_MASK) +#define SIMDE_MM_MASK_MASK _MM_MASK_MASK +#else +#define SIMDE_MM_MASK_MASK \ + (SIMDE_MM_MASK_INVALID | SIMDE_MM_MASK_DENORM | \ + SIMDE_MM_MASK_DIV_ZERO | SIMDE_MM_MASK_OVERFLOW | \ + SIMDE_MM_MASK_UNDERFLOW | SIMDE_MM_MASK_INEXACT) +#endif + +#if defined(_MM_FLUSH_ZERO_MASK) +#define SIMDE_MM_FLUSH_ZERO_MASK _MM_FLUSH_ZERO_MASK +#else +#define SIMDE_MM_FLUSH_ZERO_MASK (0x8000) +#endif +#if defined(_MM_FLUSH_ZERO_ON) +#define SIMDE_MM_FLUSH_ZERO_ON _MM_FLUSH_ZERO_ON +#else +#define SIMDE_MM_FLUSH_ZERO_ON (0x8000) +#endif +#if defined(_MM_FLUSH_ZERO_OFF) +#define SIMDE_MM_FLUSH_ZERO_OFF _MM_FLUSH_ZERO_OFF +#else +#define 
SIMDE_MM_FLUSH_ZERO_OFF (0x0000) +#endif + +SIMDE_END_DECLS_ + +HEDLEY_DIAGNOSTIC_POP -#endif /* !defined(SIMDE__SSE_H) */ +#endif /* !defined(SIMDE_X86_SSE_H) */ diff --git a/libobs/util/simde/sse2.h b/libobs/util/simde/sse2.h index caad0a4ed075e9ef61210320a8801be784f280b9..651edda1573aee43779ae9bc57b30aea9f89a8e0 100644 --- a/libobs/util/simde/sse2.h +++ b/libobs/util/simde/sse2.h @@ -1,4 +1,6 @@ -/* Permission is hereby granted, free of charge, to any person +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation * files (the "Software"), to deal in the Software without * restriction, including without limitation the rights to use, copy, @@ -19,7 +21,7 @@ * SOFTWARE. * * Copyright: - * 2017 Evan Nemerson + * 2017-2020 Evan Nemerson * 2015-2017 John W. Ratcliff * 2015 Brandon Rowlett * 2015 Ken Fast @@ -27,1858 +29,2994 @@ * 2018 Jeff Daily */ -#if !defined(SIMDE__SSE2_H) -#if !defined(SIMDE__SSE2_H) -#define SIMDE__SSE2_H -#endif +#if !defined(SIMDE_X86_SSE2_H) +#define SIMDE_X86_SSE2_H + #include "sse.h" -#if defined(SIMDE_SSE2_NATIVE) -#undef SIMDE_SSE2_NATIVE -#endif -#if defined(SIMDE_SSE2_FORCE_NATIVE) -#define SIMDE_SSE2_NATIVE -#elif defined(__SSE2__) && !defined(SIMDE_SSE2_NO_NATIVE) && \ - !defined(SIMDE_NO_NATIVE) -#define SIMDE_SSE2_NATIVE -#elif defined(__ARM_NEON) && !defined(SIMDE_SSE2_NO_NEON) && \ - !defined(SIMDE_NO_NEON) -#define SIMDE_SSE2_NEON -#endif - -#if defined(SIMDE_SSE2_NATIVE) && !defined(SIMDE_SSE_NATIVE) -#if defined(SIMDE_SSE2_FORCE_NATIVE) -#error Native SSE2 support requires native SSE support -#else -#warning Native SSE2 support requires native SSE support, disabling -#undef SIMDE_SSE2_NATIVE -#endif -#elif defined(SIMDE_SSE2_NEON) && !defined(SIMDE_SSE_NEON) -#warning SSE2 NEON support requires SSE NEON support, disabling -#undef SIMDE_SSE_NEON -#endif - -#if defined(SIMDE_SSE2_NATIVE) -#include -#else -#if defined(SIMDE_SSE2_NEON) -#include -#endif -#endif - -#include -#include -#include - -#define vreinterpretq_m128i_s32(v) \ - (simde__m128i) { .neon_i32 = v } -#define vreinterpretq_m128i_u64(v) \ - (simde__m128i) { .neon_u64 = v } - -#define vreinterpretq_s32_m128i(a) a.neon_i32 -#define vreinterpretq_u64_m128i(a) a.neon_u64 - -SIMDE__BEGIN_DECLS - -typedef SIMDE_ALIGN(16) union { -#if defined(SIMDE__ENABLE_GCC_VEC_EXT) - int8_t i8 __attribute__((__vector_size__(16), __may_alias__)); - int16_t i16 __attribute__((__vector_size__(16), __may_alias__)); - int32_t i32 __attribute__((__vector_size__(16), __may_alias__)); - int64_t i64 __attribute__((__vector_size__(16), __may_alias__)); - uint8_t u8 __attribute__((__vector_size__(16), __may_alias__)); - uint16_t u16 __attribute__((__vector_size__(16), __may_alias__)); - uint32_t u32 __attribute__((__vector_size__(16), __may_alias__)); - uint64_t u64 __attribute__((__vector_size__(16), __may_alias__)); -#if defined(SIMDE__HAVE_INT128) - simde_int128 i128 __attribute__((__vector_size__(16), __may_alias__)); - simde_uint128 u128 __attribute__((__vector_size__(16), __may_alias__)); -#endif - simde_float32 f32 __attribute__((__vector_size__(16), __may_alias__)); - simde_float64 f64 __attribute__((__vector_size__(16), __may_alias__)); -#else - int8_t i8[16]; - int16_t i16[8]; - int32_t i32[4]; - int64_t i64[2]; - uint8_t u8[16]; - uint16_t u16[8]; - uint32_t u32[4]; - uint64_t u64[2]; -#if defined(SIMDE__HAVE_INT128) - simde_int128 i128[1]; - simde_uint128 u128[1]; -#endif - simde_float32 f32[4]; - 
simde_float64 f64[2]; -#endif - -#if defined(SIMDE_SSE2_NATIVE) - __m128i n; -#elif defined(SIMDE_SSE2_NEON) - int8x16_t neon_i8; - int16x8_t neon_i16; - int32x4_t neon_i32; - int64x2_t neon_i64; - uint8x16_t neon_u8; - uint16x8_t neon_u16; - uint32x4_t neon_u32; - uint64x2_t neon_u64; - float32x4_t neon_f32; -#if defined(SIMDE_ARCH_AMD64) - float64x2_t neon_f64; -#endif -#endif -} simde__m128i; - -typedef SIMDE_ALIGN(16) union { -#if defined(SIMDE__ENABLE_GCC_VEC_EXT) - int8_t i8 __attribute__((__vector_size__(16), __may_alias__)); - int16_t i16 __attribute__((__vector_size__(16), __may_alias__)); - int32_t i32 __attribute__((__vector_size__(16), __may_alias__)); - int64_t i64 __attribute__((__vector_size__(16), __may_alias__)); - uint8_t u8 __attribute__((__vector_size__(16), __may_alias__)); - uint16_t u16 __attribute__((__vector_size__(16), __may_alias__)); - uint32_t u32 __attribute__((__vector_size__(16), __may_alias__)); - uint64_t u64 __attribute__((__vector_size__(16), __may_alias__)); - simde_float32 f32 __attribute__((__vector_size__(16), __may_alias__)); - simde_float64 f64 __attribute__((__vector_size__(16), __may_alias__)); -#else - int8_t i8[16]; - int16_t i16[8]; - int32_t i32[4]; - int64_t i64[2]; - uint8_t u8[16]; - uint16_t u16[8]; - uint32_t u32[4]; - uint64_t u64[2]; - simde_float32 f32[4]; - simde_float64 f64[2]; -#endif - -#if defined(SIMDE_SSE2_NATIVE) - __m128d n; -#elif defined(SIMDE_SSE2_NEON) - int8x16_t neon_i8; - int16x8_t neon_i16; - int32x4_t neon_i32; - int64x2_t neon_i64; - uint8x16_t neon_u8; - uint16x8_t neon_u16; - uint32x4_t neon_u32; - uint64x2_t neon_u64; - float32x4_t neon_f32; -#if defined(SIMDE_ARCH_AMD64) - float64x2_t neon_f64; -#endif -#endif -} simde__m128d; - -#if defined(SIMDE_SSE2_NATIVE) -HEDLEY_STATIC_ASSERT(sizeof(__m128i) == sizeof(simde__m128i), - "__m128i size doesn't match simde__m128i size"); -HEDLEY_STATIC_ASSERT(sizeof(__m128d) == sizeof(simde__m128d), - "__m128d size doesn't match simde__m128d size"); -SIMDE__FUNCTION_ATTRIBUTES simde__m128i SIMDE__M128I_C(__m128i v) +#if !defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ENABLE_NATIVE_ALIASES) +#define SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES +#endif + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +typedef union { +#if defined(SIMDE_VECTOR_SUBSCRIPT) + SIMDE_ALIGN(16) int8_t i8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; + SIMDE_ALIGN(16) int16_t i16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; + SIMDE_ALIGN(16) int32_t i32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; + SIMDE_ALIGN(16) int64_t i64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; + SIMDE_ALIGN(16) uint8_t u8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; + SIMDE_ALIGN(16) uint16_t u16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; + SIMDE_ALIGN(16) uint32_t u32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; + SIMDE_ALIGN(16) uint64_t u64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; +#if defined(SIMDE_HAVE_INT128_) + SIMDE_ALIGN(16) simde_int128 i128 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; + SIMDE_ALIGN(16) simde_uint128 u128 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; +#endif + SIMDE_ALIGN(16) simde_float32 f32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; + SIMDE_ALIGN(16) simde_float64 f64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; + + SIMDE_ALIGN(16) int_fast32_t i32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; + SIMDE_ALIGN(16) uint_fast32_t u32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; +#else + SIMDE_ALIGN(16) int8_t i8[16]; + SIMDE_ALIGN(16) int16_t i16[8]; + SIMDE_ALIGN(16) int32_t i32[4]; + SIMDE_ALIGN(16) int64_t i64[2]; + SIMDE_ALIGN(16) uint8_t u8[16]; + SIMDE_ALIGN(16) uint16_t u16[8]; + SIMDE_ALIGN(16) uint32_t u32[4]; 
+ SIMDE_ALIGN(16) uint64_t u64[2]; +#if defined(SIMDE_HAVE_INT128_) + SIMDE_ALIGN(16) simde_int128 i128[1]; + SIMDE_ALIGN(16) simde_uint128 u128[1]; +#endif + SIMDE_ALIGN(16) simde_float32 f32[4]; + SIMDE_ALIGN(16) simde_float64 f64[2]; + + SIMDE_ALIGN(16) int_fast32_t i32f[16 / sizeof(int_fast32_t)]; + SIMDE_ALIGN(16) uint_fast32_t u32f[16 / sizeof(uint_fast32_t)]; +#endif + + SIMDE_ALIGN(16) simde__m64_private m64_private[2]; + SIMDE_ALIGN(16) simde__m64 m64[2]; + +#if defined(SIMDE_X86_SSE2_NATIVE) + SIMDE_ALIGN(16) __m128i n; +#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + SIMDE_ALIGN(16) int8x16_t neon_i8; + SIMDE_ALIGN(16) int16x8_t neon_i16; + SIMDE_ALIGN(16) int32x4_t neon_i32; + SIMDE_ALIGN(16) int64x2_t neon_i64; + SIMDE_ALIGN(16) uint8x16_t neon_u8; + SIMDE_ALIGN(16) uint16x8_t neon_u16; + SIMDE_ALIGN(16) uint32x4_t neon_u32; + SIMDE_ALIGN(16) uint64x2_t neon_u64; + SIMDE_ALIGN(16) float32x4_t neon_f32; +#if defined(SIMDE_ARCH_AARCH64) + SIMDE_ALIGN(16) float64x2_t neon_f64; +#endif +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + SIMDE_ALIGN(16) v128_t wasm_v128; +#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) + SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(signed char) altivec_i8; + SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(signed short) altivec_i16; + SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(signed int) altivec_i32; +#if defined(__UINT_FAST32_TYPE__) + SIMDE_ALIGN(16) + SIMDE_POWER_ALTIVEC_VECTOR(__INT_FAST32_TYPE__) altivec_i32f; +#else + SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(signed int) altivec_i32f; +#endif + SIMDE_ALIGN(16) + SIMDE_POWER_ALTIVEC_VECTOR(signed long long) altivec_i64; + SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) altivec_u8; + SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) altivec_u16; + SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) altivec_u32; +#if defined(__UINT_FAST32_TYPE__) + SIMDE_ALIGN(16) vector __UINT_FAST32_TYPE__ altivec_u32f; +#else + SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) altivec_u32f; +#endif + SIMDE_ALIGN(16) + SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) altivec_u64; + SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(float) altivec_f32; +#if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) + SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(double) altivec_f64; +#endif +#endif +} simde__m128i_private; + +typedef union { +#if defined(SIMDE_VECTOR_SUBSCRIPT) + SIMDE_ALIGN(16) int8_t i8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; + SIMDE_ALIGN(16) int16_t i16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; + SIMDE_ALIGN(16) int32_t i32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; + SIMDE_ALIGN(16) int64_t i64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; + SIMDE_ALIGN(16) uint8_t u8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; + SIMDE_ALIGN(16) uint16_t u16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; + SIMDE_ALIGN(16) uint32_t u32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; + SIMDE_ALIGN(16) uint64_t u64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; + SIMDE_ALIGN(16) simde_float32 f32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; + SIMDE_ALIGN(16) simde_float64 f64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; + SIMDE_ALIGN(16) int_fast32_t i32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; + SIMDE_ALIGN(16) uint_fast32_t u32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; +#else + SIMDE_ALIGN(16) int8_t i8[16]; + SIMDE_ALIGN(16) int16_t i16[8]; + SIMDE_ALIGN(16) int32_t i32[4]; + SIMDE_ALIGN(16) int64_t i64[2]; + SIMDE_ALIGN(16) uint8_t u8[16]; + SIMDE_ALIGN(16) uint16_t u16[8]; + SIMDE_ALIGN(16) uint32_t u32[4]; + SIMDE_ALIGN(16) uint64_t u64[2]; + SIMDE_ALIGN(16) simde_float32 f32[4]; + SIMDE_ALIGN(16) simde_float64 f64[2]; + SIMDE_ALIGN(16) 
int_fast32_t i32f[16 / sizeof(int_fast32_t)]; + SIMDE_ALIGN(16) uint_fast32_t u32f[16 / sizeof(uint_fast32_t)]; +#endif + + SIMDE_ALIGN(16) simde__m64_private m64_private[2]; + SIMDE_ALIGN(16) simde__m64 m64[2]; + +#if defined(SIMDE_X86_SSE2_NATIVE) + SIMDE_ALIGN(16) __m128d n; +#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + SIMDE_ALIGN(16) int8x16_t neon_i8; + SIMDE_ALIGN(16) int16x8_t neon_i16; + SIMDE_ALIGN(16) int32x4_t neon_i32; + SIMDE_ALIGN(16) int64x2_t neon_i64; + SIMDE_ALIGN(16) uint8x16_t neon_u8; + SIMDE_ALIGN(16) uint16x8_t neon_u16; + SIMDE_ALIGN(16) uint32x4_t neon_u32; + SIMDE_ALIGN(16) uint64x2_t neon_u64; + SIMDE_ALIGN(16) float32x4_t neon_f32; +#if defined(SIMDE_ARCH_AARCH64) + SIMDE_ALIGN(16) float64x2_t neon_f64; +#endif +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + SIMDE_ALIGN(16) v128_t wasm_v128; +#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) + SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(signed char) altivec_i8; + SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(signed short) altivec_i16; + SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(signed int) altivec_i32; +#if defined(__INT_FAST32_TYPE__) + SIMDE_ALIGN(16) + SIMDE_POWER_ALTIVEC_VECTOR(__INT_FAST32_TYPE__) altivec_i32f; +#else + SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(signed int) altivec_i32f; +#endif + SIMDE_ALIGN(16) + SIMDE_POWER_ALTIVEC_VECTOR(signed long long) altivec_i64; + SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) altivec_u8; + SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) altivec_u16; + SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) altivec_u32; +#if defined(__UINT_FAST32_TYPE__) + SIMDE_ALIGN(16) vector __UINT_FAST32_TYPE__ altivec_u32f; +#else + SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) altivec_u32f; +#endif + SIMDE_ALIGN(16) + SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) altivec_u64; + SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(float) altivec_f32; +#if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) + SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(double) altivec_f64; +#endif +#endif +} simde__m128d_private; + +#if defined(SIMDE_X86_SSE2_NATIVE) +typedef __m128i simde__m128i; +typedef __m128d simde__m128d; +#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) +typedef int64x2_t simde__m128i; +#if defined(SIMDE_ARCH_AARCH64) +typedef float64x2_t simde__m128d; +#elif defined(SIMDE_VECTOR_SUBSCRIPT) +typedef simde_float64 simde__m128d SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; +#else +typedef simde__m128d_private simde__m128d; +#endif +#elif defined(SIMDE_WASM_SIMD128_NATIVE) +typedef v128_t simde__m128i; +typedef v128_t simde__m128d; +#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) +typedef SIMDE_POWER_ALTIVEC_VECTOR(float) simde__m128i; +typedef SIMDE_POWER_ALTIVEC_VECTOR(double) simde__m128d; +#elif defined(SIMDE_VECTOR_SUBSCRIPT) +typedef int_fast32_t simde__m128i SIMDE_ALIGN(16) + SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; +typedef simde_float64 simde__m128d SIMDE_ALIGN(16) + SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; +#else +typedef simde__m128i_private simde__m128i; +typedef simde__m128d_private simde__m128d; +#endif + +#if !defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ENABLE_NATIVE_ALIASES) +#define SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES +typedef simde__m128i __m128i; +typedef simde__m128d __m128d; +#endif + +HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128i), "simde__m128i size incorrect"); +HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128i_private), + "simde__m128i_private size incorrect"); +HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128d), "simde__m128d size incorrect"); +HEDLEY_STATIC_ASSERT(16 == 
sizeof(simde__m128d_private), + "simde__m128d_private size incorrect"); +#if defined(SIMDE_CHECK_ALIGNMENT) && defined(SIMDE_ALIGN_OF) +HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128i) == 16, + "simde__m128i is not 16-byte aligned"); +HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128i_private) == 16, + "simde__m128i_private is not 16-byte aligned"); +HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128d) == 16, + "simde__m128d is not 16-byte aligned"); +HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128d_private) == 16, + "simde__m128d_private is not 16-byte aligned"); +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i simde__m128i_from_private(simde__m128i_private v) { simde__m128i r; - r.n = v; + simde_memcpy(&r, &v, sizeof(r)); return r; } -SIMDE__FUNCTION_ATTRIBUTES simde__m128d SIMDE__M128D_C(__m128d v) + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i_private simde__m128i_to_private(simde__m128i v) +{ + simde__m128i_private r; + simde_memcpy(&r, &v, sizeof(r)); + return r; +} + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128d simde__m128d_from_private(simde__m128d_private v) { simde__m128d r; - r.n = v; + simde_memcpy(&r, &v, sizeof(r)); return r; } -#elif defined(SIMDE_SSE_NEON) -#define SIMDE__M128I_NEON_C(T, expr) \ - (simde__m128i) { .neon_##T = expr } -#define SIMDE__M128D_NEON_C(T, expr) \ - (simde__m128d) { .neon_##T = expr } + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128d_private simde__m128d_to_private(simde__m128d v) +{ + simde__m128d_private r; + simde_memcpy(&r, &v, sizeof(r)); + return r; +} + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) +SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, int8x16_t, neon, i8) +SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, int16x8_t, neon, i16) +SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, int32x4_t, neon, i32) +SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, int64x2_t, neon, i64) +SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, uint8x16_t, neon, u8) +SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, uint16x8_t, neon, u16) +SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, uint32x4_t, neon, u32) +SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, uint64x2_t, neon, u64) +SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, float32x4_t, neon, f32) +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) +SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, float64x2_t, neon, f64) #endif -HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128i), "simde__m128i size incorrect"); -HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128d), "simde__m128d size incorrect"); +#endif /* defined(SIMDE_ARM_NEON_A32V7_NATIVE) */ + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) +SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, int8x16_t, neon, i8) +SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, int16x8_t, neon, i16) +SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, int32x4_t, neon, i32) +SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, int64x2_t, neon, i64) +SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, uint8x16_t, neon, u8) +SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, uint16x8_t, neon, u16) +SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, uint32x4_t, neon, u32) +SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, uint64x2_t, neon, u64) +SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, float32x4_t, neon, f32) +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) +SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, float64x2_t, neon, f64) +#endif +#endif /* defined(SIMDE_ARM_NEON_A32V7_NATIVE) */ -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_add_epi8(simde__m128i a, simde__m128i b) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128I_C(_mm_add_epi8(a.n, b.n)); 
-#elif defined(SIMDE_SSE2_NEON) - return SIMDE__M128I_NEON_C(i8, vaddq_s8(a.neon_i8, b.neon_i8)); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_add_epi8(a, b); #else - simde__m128i r; - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.i8) / sizeof(r.i8[0])); i++) { - r.i8[i] = a.i8[i] + b.i8[i]; + simde__m128i_private r_, a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i8 = vaddq_s8(a_.neon_i8, b_.neon_i8); +#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) + r_.altivec_i8 = vec_add(a_.altivec_i8, b_.altivec_i8); +#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i8 = a_.i8 + b_.i8; +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])); i++) { + r_.i8[i] = a_.i8[i] + b_.i8[i]; } - return r; +#endif + + return simde__m128i_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_add_epi8(a, b) simde_mm_add_epi8(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_add_epi16(simde__m128i a, simde__m128i b) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128I_C(_mm_add_epi16(a.n, b.n)); -#elif defined(SIMDE_SSE2_NEON) - return SIMDE__M128I_NEON_C(i16, vaddq_s16(a.neon_i16, b.neon_i16)); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_add_epi16(a, b); #else - simde__m128i r; - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) { - r.i16[i] = a.i16[i] + b.i16[i]; + simde__m128i_private r_, a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i16 = vaddq_s16(a_.neon_i16, b_.neon_i16); +#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) + r_.altivec_i16 = vec_add(a_.altivec_i16, b_.altivec_i16); +#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i16 = a_.i16 + b_.i16; +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) { + r_.i16[i] = a_.i16[i] + b_.i16[i]; } - return r; +#endif + + return simde__m128i_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_add_epi16(a, b) simde_mm_add_epi16(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_add_epi32(simde__m128i a, simde__m128i b) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128I_C(_mm_add_epi32(a.n, b.n)); -#elif defined(SIMDE_SSE2_NEON) - return SIMDE__M128I_NEON_C(i32, vaddq_s32(a.neon_i32, b.neon_i32)); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_add_epi32(a, b); #else - simde__m128i r; - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) { - r.i32[i] = a.i32[i] + b.i32[i]; + simde__m128i_private r_, a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i32 = vaddq_s32(a_.neon_i32, b_.neon_i32); +#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) + r_.altivec_i32 = vec_add(a_.altivec_i32, b_.altivec_i32); +#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i32 = a_.i32 + b_.i32; +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) { + r_.i32[i] = a_.i32[i] + b_.i32[i]; } - return r; +#endif + + return simde__m128i_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_add_epi32(a, b) simde_mm_add_epi32(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_add_epi64(simde__m128i a, simde__m128i b) { -#if defined(SIMDE_SSE2_NATIVE) - return 
SIMDE__M128I_C(_mm_add_epi64(a.n, b.n)); -#elif defined(SIMDE_SSE2_NEON) - return SIMDE__M128I_NEON_C(i64, vaddq_s64(a.neon_i64, b.neon_i64)); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_add_epi64(a, b); #else - simde__m128i r; - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) { - r.i64[i] = a.i64[i] + b.i64[i]; + simde__m128i_private r_, a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i64 = vaddq_s64(a_.neon_i64, b_.neon_i64); +#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) + r_.altivec_i64 = vec_add(a_.altivec_i64, b_.altivec_i64); +#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i64 = a_.i64 + b_.i64; +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i64) / sizeof(r_.i64[0])); i++) { + r_.i64[i] = a_.i64[i] + b_.i64[i]; } - return r; +#endif + + return simde__m128i_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_add_epi64(a, b) simde_mm_add_epi64(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_add_pd(simde__m128d a, simde__m128d b) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128D_C(_mm_add_pd(a.n, b.n)); -#elif defined(SIMDE_SSE2_NEON) && defined(SIMDE_ARCH_AMD64) - return SIMDE__M128I_NEON_C(f64, vaddq_f64(a.neon_f64, b.neon_f64)); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_add_pd(a, b); #else - simde__m128d r; - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) { - r.f64[i] = a.f64[i] + b.f64[i]; + simde__m128d_private r_, a_ = simde__m128d_to_private(a), + b_ = simde__m128d_to_private(b); + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + r_.neon_f64 = vaddq_f64(a_.neon_f64, b_.neon_f64); +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f64x2_add(a_.wasm_v128, b_.wasm_v128); +#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) + r_.altivec_f64 = vec_add(a_.altivec_f64, b_.altivec_f64); +#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.f64 = a_.f64 + b_.f64; +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.f64) / sizeof(r_.f64[0])); i++) { + r_.f64[i] = a_.f64[i] + b_.f64[i]; } - return r; +#endif + + return simde__m128d_from_private(r_); +#endif +} +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_add_pd(a, b) simde_mm_add_pd(a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128d simde_mm_move_sd(simde__m128d a, simde__m128d b) +{ +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_move_sd(a, b); +#else + simde__m128d_private r_, a_ = simde__m128d_to_private(a), + b_ = simde__m128d_to_private(b); + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + r_.neon_f64 = + vsetq_lane_f64(vgetq_lane_f64(b_.neon_f64, 0), a_.neon_f64, 0); +#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) + SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) + m = {16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15}; + r_.altivec_f64 = vec_perm(a_.altivec_f64, b_.altivec_f64, m); +#elif defined(SIMDE_SHUFFLE_VECTOR_) + r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, b_.f64, 2, 1); +#else + r_.f64[0] = b_.f64[0]; + r_.f64[1] = a_.f64[1]; +#endif + + return simde__m128d_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_move_sd(a, b) simde_mm_move_sd(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_add_sd(simde__m128d a, simde__m128d b) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128D_C(_mm_add_sd(a.n, b.n)); +#if defined(SIMDE_X86_SSE2_NATIVE) + return 
_mm_add_sd(a, b); #else - simde__m128d r; - r.f64[0] = a.f64[0] + b.f64[0]; - r.f64[1] = a.f64[1]; - return r; + simde__m128d_private r_, a_ = simde__m128d_to_private(a), + b_ = simde__m128d_to_private(b); + + r_.f64[0] = a_.f64[0] + b_.f64[0]; + r_.f64[1] = a_.f64[1]; + +#if defined(SIMDE_ASSUME_VECTORIZATION) + return simde_mm_move_sd(a, simde_mm_add_pd(a, b)); +#else + r_.f64[0] = a_.f64[0] + b_.f64[0]; + r_.f64[1] = a_.f64[1]; +#endif + + return simde__m128d_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_add_sd(a, b) simde_mm_add_sd(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_add_si64(simde__m64 a, simde__m64 b) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M64_C(_mm_add_si64(a.n, b.n)); -#elif defined(SIMDE_SSE2_NEON) - return SIMDE__M64_NEON_C(i64, vadd_s64(a.neon_i64, b.neon_i64)); +#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) + return _mm_add_si64(a, b); #else - simde__m64 r; - r.i64[0] = a.i64[0] + b.i64[0]; - return r; + simde__m64_private r_, a_ = simde__m64_to_private(a), + b_ = simde__m64_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i64 = vadd_s64(a_.neon_i64, b_.neon_i64); +#else + r_.i64[0] = a_.i64[0] + b_.i64[0]; +#endif + + return simde__m64_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_add_si64(a, b) simde_mm_add_si64(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_adds_epi8(simde__m128i a, simde__m128i b) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128I_C(_mm_adds_epi8(a.n, b.n)); -#elif defined(SIMDE_SSE2_NEON) - return SIMDE__M128I_NEON_C(i8, vqaddq_s8(a.neon_i8, b.neon_i8)); -#else - simde__m128i r; - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.i8) / sizeof(r.i8[0])); i++) { - if ((((b.i8[i]) > 0) && ((a.i8[i]) > (INT8_MAX - (b.i8[i]))))) { - r.i8[i] = INT8_MAX; - } else if ((((b.i8[i]) < 0) && - ((a.i8[i]) < (INT8_MIN - (b.i8[i]))))) { - r.i8[i] = INT8_MIN; - } else { - r.i8[i] = (a.i8[i]) + (b.i8[i]); - } +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_adds_epi8(a, b); +#else + simde__m128i_private r_, a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i8 = vqaddq_s8(a_.neon_i8, b_.neon_i8); +#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) + r_.altivec_i8 = vec_adds(a_.altivec_i8, b_.altivec_i8); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])); i++) { + const int32_t tmp = HEDLEY_STATIC_CAST(int16_t, a_.i8[i]) + + HEDLEY_STATIC_CAST(int16_t, b_.i8[i]); + r_.i8[i] = HEDLEY_STATIC_CAST( + int8_t, + ((tmp < INT8_MAX) ? ((tmp > INT8_MIN) ? 
tmp : INT8_MIN) + : INT8_MAX)); } - return r; +#endif + + return simde__m128i_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_adds_epi8(a, b) simde_mm_adds_epi8(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_adds_epi16(simde__m128i a, simde__m128i b) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128I_C(_mm_adds_epi16(a.n, b.n)); -#elif defined(SIMDE_SSE2_NEON) - return SIMDE__M128I_NEON_C(i16, vqaddq_s16(a.neon_i16, b.neon_i16)); -#else - simde__m128i r; - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) { - if ((((b.i16[i]) > 0) && - ((a.i16[i]) > (INT16_MAX - (b.i16[i]))))) { - r.i16[i] = INT16_MAX; - } else if ((((b.i16[i]) < 0) && - ((a.i16[i]) < (INT16_MIN - (b.i16[i]))))) { - r.i16[i] = INT16_MIN; - } else { - r.i16[i] = (a.i16[i]) + (b.i16[i]); - } +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_adds_epi16(a, b); +#else + simde__m128i_private r_, a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i16 = vqaddq_s16(a_.neon_i16, b_.neon_i16); +#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) + r_.altivec_i16 = vec_adds(a_.altivec_i16, b_.altivec_i16); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) { + const int32_t tmp = HEDLEY_STATIC_CAST(int32_t, a_.i16[i]) + + HEDLEY_STATIC_CAST(int32_t, b_.i16[i]); + r_.i16[i] = HEDLEY_STATIC_CAST( + int16_t, + ((tmp < INT16_MAX) + ? ((tmp > INT16_MIN) ? tmp : INT16_MIN) + : INT16_MAX)); } - return r; +#endif + + return simde__m128i_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_adds_epi16(a, b) simde_mm_adds_epi16(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_adds_epu8(simde__m128i a, simde__m128i b) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128I_C(_mm_adds_epu8(a.n, b.n)); -#elif defined(SIMDE_SSE2_NEON) - return SIMDE__M128I_NEON_C(u8, vqaddq_u8(a.neon_u8, b.neon_u8)); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_adds_epu8(a, b); #else - simde__m128i r; - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.u8) / sizeof(r.u8[0])); i++) { - r.u8[i] = ((UINT8_MAX - a.u8[i]) > b.u8[i]) - ? (a.u8[i] + b.u8[i]) - : UINT8_MAX; + simde__m128i_private r_, a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_u8 = vqaddq_u8(a_.neon_u8, b_.neon_u8); +#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) + r_.altivec_u8 = vec_adds(a_.altivec_u8, b_.altivec_u8); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.u8) / sizeof(r_.u8[0])); i++) { + r_.u8[i] = ((UINT8_MAX - a_.u8[i]) > b_.u8[i]) + ? (a_.u8[i] + b_.u8[i]) + : UINT8_MAX; } - return r; +#endif + + return simde__m128i_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_adds_epu8(a, b) simde_mm_adds_epu8(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_adds_epu16(simde__m128i a, simde__m128i b) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128I_C(_mm_adds_epu16(a.n, b.n)); -#elif defined(SIMDE_SSE2_NEON) - return SIMDE__M128I_NEON_C(u16, vqaddq_u16(a.neon_u16, b.neon_u16)); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_adds_epu16(a, b); #else - simde__m128i r; - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.u16) / sizeof(r.u16[0])); i++) { - r.u16[i] = ((UINT16_MAX - a.u16[i]) > b.u16[i]) - ? 
(a.u16[i] + b.u16[i]) - : UINT16_MAX; + simde__m128i_private r_, a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_u16 = vqaddq_u16(a_.neon_u16, b_.neon_u16); +#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) + r_.altivec_u16 = vec_adds(a_.altivec_u16, b_.altivec_u16); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.u16) / sizeof(r_.u16[0])); i++) { + r_.u16[i] = ((UINT16_MAX - a_.u16[i]) > b_.u16[i]) + ? (a_.u16[i] + b_.u16[i]) + : UINT16_MAX; } - return r; +#endif + + return simde__m128i_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_adds_epu16(a, b) simde_mm_adds_epu16(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_and_pd(simde__m128d a, simde__m128d b) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128D_C(_mm_and_pd(a.n, b.n)); -#elif defined(SIMDE_SSE2_NEON) - return SIMDE__M128D_NEON_C(i32, vandq_s32(a.neon_i32, b.neon_i32)); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_and_pd(a, b); #else - simde__m128d r; - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.u64) / sizeof(r.u64[0])); i++) { - r.u64[i] = a.u64[i] & b.u64[i]; + simde__m128d_private r_, a_ = simde__m128d_to_private(a), + b_ = simde__m128d_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i32 = vandq_s32(a_.neon_i32, b_.neon_i32); +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_v128_and(a_.wasm_v128, b_.wasm_v128); +#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) + r_.altivec_f64 = vec_and(a_.altivec_f64, b_.altivec_f64); +#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i32f = a_.i32f & b_.i32f; +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])); i++) { + r_.i32f[i] = a_.i32f[i] & b_.i32f[i]; } - return r; +#endif + + return simde__m128d_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_and_pd(a, b) simde_mm_and_pd(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_and_si128(simde__m128i a, simde__m128i b) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128I_C(_mm_and_si128(a.n, b.n)); -#elif defined(SIMDE_SSE_NEON) - return SIMDE__M128I_NEON_C(i32, vandq_s32(b.neon_i32, a.neon_i32)); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_and_si128(a, b); #else - simde__m128i r; - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) { - r.i64[i] = a.i64[i] & b.i64[i]; + simde__m128i_private r_, a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i32 = vandq_s32(b_.neon_i32, a_.neon_i32); +#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) + r_.altivec_u32f = vec_and(a_.altivec_u32f, b_.altivec_u32f); +#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i32f = a_.i32f & b_.i32f; +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])); i++) { + r_.i32f[i] = a_.i32f[i] & b_.i32f[i]; } - return r; +#endif + + return simde__m128i_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_and_si128(a, b) simde_mm_and_si128(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_andnot_pd(simde__m128d a, simde__m128d b) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128D_C(_mm_andnot_pd(a.n, b.n)); -#elif defined(SIMDE_SSE2_NEON) - return SIMDE__M128D_NEON_C(i32, vbicq_s32(a.neon_i32, b.neon_i32)); +#if 
defined(SIMDE_X86_SSE2_NATIVE) + return _mm_andnot_pd(a, b); #else - simde__m128d r; - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.u64) / sizeof(r.u64[0])); i++) { - r.u64[i] = ~a.u64[i] & b.u64[i]; + simde__m128d_private r_, a_ = simde__m128d_to_private(a), + b_ = simde__m128d_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i32 = vbicq_s32(a_.neon_i32, b_.neon_i32); +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_v128_andnot(b_.wasm_v128, a_.wasm_v128); +#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) + r_.altivec_i32f = vec_andc(a_.altivec_i32f, b_.altivec_i32f); +#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i32f = ~a_.i32f & b_.i32f; +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.u64) / sizeof(r_.u64[0])); i++) { + r_.u64[i] = ~a_.u64[i] & b_.u64[i]; } - return r; +#endif + + return simde__m128d_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_andnot_pd(a, b) simde_mm_andnot_pd(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_andnot_si128(simde__m128i a, simde__m128i b) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128I_C(_mm_andnot_si128(a.n, b.n)); -#elif defined(SIMDE_SSE2_NEON) - return SIMDE__M128I_NEON_C(i32, vbicq_s32(b.neon_i32, a.neon_i32)); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_andnot_si128(a, b); #else - simde__m128i r; - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) { - r.i64[i] = ~(a.i64[i]) & b.i64[i]; + simde__m128i_private r_, a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i32 = vbicq_s32(b_.neon_i32, a_.neon_i32); +#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) + r_.altivec_i32 = vec_andc(b_.altivec_i32, a_.altivec_i32); +#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i32f = ~a_.i32f & b_.i32f; +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])); i++) { + r_.i32f[i] = ~(a_.i32f[i]) & b_.i32f[i]; } - return r; +#endif + + return simde__m128i_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_andnot_si128(a, b) simde_mm_andnot_si128(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_avg_epu8(simde__m128i a, simde__m128i b) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128I_C(_mm_avg_epu8(a.n, b.n)); -#elif defined(SIMDE_SSE2_NEON) - return SIMDE__M128I_NEON_C(u8, vrhaddq_u8(b.neon_u8, a.neon_u8)); -#else - simde__m128i r; - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.u8) / sizeof(r.u8[0])); i++) { - r.u8[i] = (a.u8[i] + b.u8[i] + 1) >> 1; +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_avg_epu8(a, b); +#else + simde__m128i_private r_, a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_u8 = vrhaddq_u8(b_.neon_u8, a_.neon_u8); +#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) + r_.altivec_u8 = vec_avg(a_.altivec_u8, b_.altivec_u8); +#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && \ + defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && \ + defined(SIMDE_CONVERT_VECTOR_) + uint16_t wa SIMDE_VECTOR(32); + uint16_t wb SIMDE_VECTOR(32); + uint16_t wr SIMDE_VECTOR(32); + SIMDE_CONVERT_VECTOR_(wa, a_.u8); + SIMDE_CONVERT_VECTOR_(wb, b_.u8); + wr = (wa + wb + 1) >> 1; + SIMDE_CONVERT_VECTOR_(r_.u8, wr); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.u8) / sizeof(r_.u8[0])); i++) { + r_.u8[i] = (a_.u8[i] + b_.u8[i] + 1) >> 
1; } - return r; +#endif + + return simde__m128i_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_avg_epu8(a, b) simde_mm_avg_epu8(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_avg_epu16(simde__m128i a, simde__m128i b) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128I_C(_mm_avg_epu16(a.n, b.n)); -#elif defined(SIMDE_SSE2_NEON) - return SIMDE__M128I_NEON_C(u16, vrhaddq_u16(b.neon_u16, a.neon_u16)); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_avg_epu16(a, b); +#else + simde__m128i_private r_, a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_u16 = vrhaddq_u16(b_.neon_u16, a_.neon_u16); +#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) + r_.altivec_u16 = vec_avg(a_.altivec_u16, b_.altivec_u16); +#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && \ + defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && \ + defined(SIMDE_CONVERT_VECTOR_) + uint32_t wa SIMDE_VECTOR(32); + uint32_t wb SIMDE_VECTOR(32); + uint32_t wr SIMDE_VECTOR(32); + SIMDE_CONVERT_VECTOR_(wa, a_.u16); + SIMDE_CONVERT_VECTOR_(wb, b_.u16); + wr = (wa + wb + 1) >> 1; + SIMDE_CONVERT_VECTOR_(r_.u16, wr); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.u16) / sizeof(r_.u16[0])); i++) { + r_.u16[i] = (a_.u16[i] + b_.u16[i] + 1) >> 1; + } +#endif + + return simde__m128i_from_private(r_); +#endif +} +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_avg_epu16(a, b) simde_mm_avg_epu16(a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i simde_mm_setzero_si128(void) +{ +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_setzero_si128(); #else - simde__m128i r; - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.u16) / sizeof(r.u16[0])); i++) { - r.u16[i] = (a.u16[i] + b.u16[i] + 1) >> 1; + simde__m128i_private r_; + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i32 = vdupq_n_s32(0); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])); i++) { + r_.i32f[i] = 0; } - return r; +#endif + + return simde__m128i_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_setzero_si128() (simde_mm_setzero_si128()) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_bslli_si128(simde__m128i a, const int imm8) + SIMDE_REQUIRE_RANGE(imm8, 0, 255) { - simde__m128i r; + simde__m128i_private r_, a_ = simde__m128i_to_private(a); - if (HEDLEY_UNLIKELY(imm8 > 15)) { - r.u64[0] = 0; - r.u64[1] = 0; - return r; + if (HEDLEY_UNLIKELY((imm8 & ~15))) { + return simde_mm_setzero_si128(); } - const int s = imm8 * 8; - -#if defined(SIMDE__HAVE_INT128) - r.u128[0] = a.u128[0] << s; +#if defined(SIMDE_HAVE_INT128_) && defined(__BYTE_ORDER__) && \ + (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) && 0 + r_.u128[0] = a_.u128[0] << s; #else - if (s < 64) { - r.u64[0] = (a.u64[0] << s); - r.u64[1] = (a.u64[1] << s) | (a.u64[0] >> (64 - s)); - } else { - r.u64[0] = 0; - r.u64[1] = a.u64[0] << (s - 64); + r_ = simde__m128i_to_private(simde_mm_setzero_si128()); + for (int i = imm8; + i < HEDLEY_STATIC_CAST(int, sizeof(r_.i8) / sizeof(r_.i8[0])); + i++) { + r_.i8[i] = a_.i8[i - imm8]; } #endif - return r; + return simde__m128i_from_private(r_); } -#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI) -#define simde_mm_bslli_si128(a, imm8) SIMDE__M128I_C(_mm_slli_si128(a.n, imm8)) -#elif defined(SIMDE_SSE2_NEON) +#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI) +#define simde_mm_bslli_si128(a, 
imm8) _mm_slli_si128(a, imm8) +#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) #define simde_mm_bslli_si128(a, imm8) \ - SIMDE__M128I_NEON_C( \ - i8, \ - (((imm8) <= 0) ? ((a).neon_i8) \ - : (((imm8) > 15) ? (vdupq_n_s8(0)) \ - : (vextq_s8(vdupq_n_s8(0), \ - (a).neon_i8, \ - 16 - (imm8)))))) + simde__m128i_from_neon_i8( \ + ((imm8) <= 0) \ + ? simde__m128i_to_neon_i8(a) \ + : (((imm8) > 15) \ + ? (vdupq_n_s8(0)) \ + : (vextq_s8(vdupq_n_s8(0), \ + simde__m128i_to_neon_i8(a), \ + 16 - (imm8))))) +#elif defined(SIMDE_SHUFFLE_VECTOR_) +#define simde_mm_bslli_si128(a, imm8) \ + (__extension__({ \ + const simde__m128i_private simde__tmp_a_ = \ + simde__m128i_to_private(a); \ + const simde__m128i_private simde__tmp_z_ = \ + simde__m128i_to_private(simde_mm_setzero_si128()); \ + simde__m128i_private simde__tmp_r_; \ + if (HEDLEY_UNLIKELY(imm8 > 15)) { \ + simde__tmp_r_ = simde__m128i_to_private( \ + simde_mm_setzero_si128()); \ + } else { \ + simde__tmp_r_.i8 = SIMDE_SHUFFLE_VECTOR_( \ + 8, 16, simde__tmp_z_.i8, (simde__tmp_a_).i8, \ + HEDLEY_STATIC_CAST(int8_t, (16 - imm8) & 31), \ + HEDLEY_STATIC_CAST(int8_t, (17 - imm8) & 31), \ + HEDLEY_STATIC_CAST(int8_t, (18 - imm8) & 31), \ + HEDLEY_STATIC_CAST(int8_t, (19 - imm8) & 31), \ + HEDLEY_STATIC_CAST(int8_t, (20 - imm8) & 31), \ + HEDLEY_STATIC_CAST(int8_t, (21 - imm8) & 31), \ + HEDLEY_STATIC_CAST(int8_t, (22 - imm8) & 31), \ + HEDLEY_STATIC_CAST(int8_t, (23 - imm8) & 31), \ + HEDLEY_STATIC_CAST(int8_t, (24 - imm8) & 31), \ + HEDLEY_STATIC_CAST(int8_t, (25 - imm8) & 31), \ + HEDLEY_STATIC_CAST(int8_t, (26 - imm8) & 31), \ + HEDLEY_STATIC_CAST(int8_t, (27 - imm8) & 31), \ + HEDLEY_STATIC_CAST(int8_t, (28 - imm8) & 31), \ + HEDLEY_STATIC_CAST(int8_t, (29 - imm8) & 31), \ + HEDLEY_STATIC_CAST(int8_t, (30 - imm8) & 31), \ + HEDLEY_STATIC_CAST(int8_t, (31 - imm8) & 31)); \ + } \ + simde__m128i_from_private(simde__tmp_r_); \ + })) #endif #define simde_mm_slli_si128(a, imm8) simde_mm_bslli_si128(a, imm8) +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_bslli_si128(a, b) simde_mm_bslli_si128(a, b) +#define _mm_slli_si128(a, b) simde_mm_bslli_si128(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_bsrli_si128(simde__m128i a, const int imm8) + SIMDE_REQUIRE_RANGE(imm8, 0, 255) { - simde__m128i r; + simde__m128i_private r_, a_ = simde__m128i_to_private(a); - if (HEDLEY_UNLIKELY(imm8 > 15)) { - r.u64[0] = 0; - r.u64[1] = 0; - return r; + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])); i++) { + const int e = HEDLEY_STATIC_CAST(int, i) + imm8; + r_.i8[i] = (e < 16) ? a_.i8[e] : 0; } - const int s = imm8 * 8; - -#if defined(SIMDE__HAVE_INT128) - r.u128[0] = a.u128[0] >> s; -#else - if (s < 64) { - r.u64[0] = (a.u64[0] >> s) | (a.u64[1] << (64 - s)); - r.u64[1] = (a.u64[1] >> s); - } else { - r.u64[0] = a.u64[1] >> (s - 64); - r.u64[1] = 0; - } -#endif - - return r; -} -#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI) -#define simde_mm_bsrli_si128(a, imm8) SIMDE__M128I_C(_mm_srli_si128(a.n, imm8)) -#elif defined(SIMDE_SSE2_NEON) -#define simde_mm_bsrli_si128(a, imm8) \ - SIMDE__M128I_NEON_C( \ - i8, \ - ((imm8) <= 0) \ - ? ((a).neon_i8) \ - : (((imm8) > 15) ? 
(vdupq_n_s8(0)) \ - : (vextq_s8((a).neon_i8, \ - vdupq_n_s8(0), (imm8))))) -#endif -#define simde_mm_srli_si128(a, imm8) simde_mm_bsrli_si128(a, imm8) - -SIMDE__FUNCTION_ATTRIBUTES + return simde__m128i_from_private(r_); +} +#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI) +#define simde_mm_bsrli_si128(a, imm8) _mm_srli_si128(a, imm8) +#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) +#define simde_mm_bsrli_si128(a, imm8) \ + simde__m128i_from_neon_i8( \ + ((imm8 < 0) || (imm8 > 15)) \ + ? vdupq_n_s8(0) \ + : (vextq_s8(simde__m128i_to_private(a).neon_i8, \ + vdupq_n_s8(0), \ + ((imm8 & 15) != 0) ? imm8 : (imm8 & 15)))) +#elif defined(SIMDE_SHUFFLE_VECTOR_) +#define simde_mm_bsrli_si128(a, imm8) \ + (__extension__({ \ + const simde__m128i_private simde__tmp_a_ = \ + simde__m128i_to_private(a); \ + const simde__m128i_private simde__tmp_z_ = \ + simde__m128i_to_private(simde_mm_setzero_si128()); \ + simde__m128i_private simde__tmp_r_ = \ + simde__m128i_to_private(a); \ + if (HEDLEY_UNLIKELY(imm8 > 15)) { \ + simde__tmp_r_ = simde__m128i_to_private( \ + simde_mm_setzero_si128()); \ + } else { \ + simde__tmp_r_.i8 = SIMDE_SHUFFLE_VECTOR_( \ + 8, 16, simde__tmp_z_.i8, (simde__tmp_a_).i8, \ + HEDLEY_STATIC_CAST(int8_t, (imm8 + 16) & 31), \ + HEDLEY_STATIC_CAST(int8_t, (imm8 + 17) & 31), \ + HEDLEY_STATIC_CAST(int8_t, (imm8 + 18) & 31), \ + HEDLEY_STATIC_CAST(int8_t, (imm8 + 19) & 31), \ + HEDLEY_STATIC_CAST(int8_t, (imm8 + 20) & 31), \ + HEDLEY_STATIC_CAST(int8_t, (imm8 + 21) & 31), \ + HEDLEY_STATIC_CAST(int8_t, (imm8 + 22) & 31), \ + HEDLEY_STATIC_CAST(int8_t, (imm8 + 23) & 31), \ + HEDLEY_STATIC_CAST(int8_t, (imm8 + 24) & 31), \ + HEDLEY_STATIC_CAST(int8_t, (imm8 + 25) & 31), \ + HEDLEY_STATIC_CAST(int8_t, (imm8 + 26) & 31), \ + HEDLEY_STATIC_CAST(int8_t, (imm8 + 27) & 31), \ + HEDLEY_STATIC_CAST(int8_t, (imm8 + 28) & 31), \ + HEDLEY_STATIC_CAST(int8_t, (imm8 + 29) & 31), \ + HEDLEY_STATIC_CAST(int8_t, (imm8 + 30) & 31), \ + HEDLEY_STATIC_CAST(int8_t, (imm8 + 31) & 31)); \ + } \ + simde__m128i_from_private(simde__tmp_r_); \ + })) +#endif +#define simde_mm_srli_si128(a, imm8) simde_mm_bsrli_si128((a), (imm8)) +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_bsrli_si128(a, imm8) simde_mm_bsrli_si128((a), (imm8)) +#define _mm_srli_si128(a, imm8) simde_mm_bsrli_si128((a), (imm8)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES void simde_mm_clflush(void const *p) { -#if defined(SIMDE_SSE2_NATIVE) +#if defined(SIMDE_X86_SSE2_NATIVE) _mm_clflush(p); #else (void)p; #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_clflush(a, b) simde_mm_clflush() +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES int simde_mm_comieq_sd(simde__m128d a, simde__m128d b) { -#if defined(SIMDE_SSE2_NATIVE) - return _mm_comieq_sd(a.n, b.n); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_comieq_sd(a, b); #else - return a.f64[0] == b.f64[0]; + simde__m128d_private a_ = simde__m128d_to_private(a), + b_ = simde__m128d_to_private(b); +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return !!vgetq_lane_u64(vceqq_f64(a_.neon_f64, b_.neon_f64), 0); +#else + return a_.f64[0] == b_.f64[0]; +#endif #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_comieq_sd(a, b) simde_mm_comieq_sd(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES int simde_mm_comige_sd(simde__m128d a, simde__m128d b) { -#if defined(SIMDE_SSE2_NATIVE) - return _mm_comige_sd(a.n, b.n); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_comige_sd(a, b); #else - return 
a.f64[0] >= b.f64[0]; + simde__m128d_private a_ = simde__m128d_to_private(a), + b_ = simde__m128d_to_private(b); +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return !!vgetq_lane_u64(vcgeq_f64(a_.neon_f64, b_.neon_f64), 0); +#else + return a_.f64[0] >= b_.f64[0]; +#endif #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_comige_sd(a, b) simde_mm_comige_sd(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES int simde_mm_comigt_sd(simde__m128d a, simde__m128d b) { -#if defined(SIMDE_SSE2_NATIVE) - return _mm_comigt_sd(a.n, b.n); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_comigt_sd(a, b); +#else + simde__m128d_private a_ = simde__m128d_to_private(a), + b_ = simde__m128d_to_private(b); +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return !!vgetq_lane_u64(vcgtq_f64(a_.neon_f64, b_.neon_f64), 0); #else - return a.f64[0] > b.f64[0]; + return a_.f64[0] > b_.f64[0]; +#endif #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_comigt_sd(a, b) simde_mm_comigt_sd(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES int simde_mm_comile_sd(simde__m128d a, simde__m128d b) { -#if defined(SIMDE_SSE2_NATIVE) - return _mm_comile_sd(a.n, b.n); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_comile_sd(a, b); +#else + simde__m128d_private a_ = simde__m128d_to_private(a), + b_ = simde__m128d_to_private(b); +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return !!vgetq_lane_u64(vcleq_f64(a_.neon_f64, b_.neon_f64), 0); #else - return a.f64[0] <= b.f64[0]; + return a_.f64[0] <= b_.f64[0]; +#endif #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_comile_sd(a, b) simde_mm_comile_sd(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES int simde_mm_comilt_sd(simde__m128d a, simde__m128d b) { -#if defined(SIMDE_SSE2_NATIVE) - return _mm_comilt_sd(a.n, b.n); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_comilt_sd(a, b); +#else + simde__m128d_private a_ = simde__m128d_to_private(a), + b_ = simde__m128d_to_private(b); +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return !!vgetq_lane_u64(vcltq_f64(a_.neon_f64, b_.neon_f64), 0); #else - return a.f64[0] < b.f64[0]; + return a_.f64[0] < b_.f64[0]; +#endif #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_comilt_sd(a, b) simde_mm_comilt_sd(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES int simde_mm_comineq_sd(simde__m128d a, simde__m128d b) { -#if defined(SIMDE_SSE2_NATIVE) - return _mm_comineq_sd(a.n, b.n); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_comineq_sd(a, b); +#else + simde__m128d_private a_ = simde__m128d_to_private(a), + b_ = simde__m128d_to_private(b); +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return !vgetq_lane_u64(vceqq_f64(a_.neon_f64, b_.neon_f64), 0); #else - return a.f64[0] != b.f64[0]; + return a_.f64[0] != b_.f64[0]; +#endif #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_comineq_sd(a, b) simde_mm_comineq_sd(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_castpd_ps(simde__m128d a) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128_C(_mm_castpd_ps(a.n)); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_castpd_ps(a); #else - union { - simde__m128d pd; - simde__m128 ps; - } r; - r.pd = a; - return r.ps; + simde__m128 r; + simde_memcpy(&r, &a, sizeof(a)); + return r; #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_castpd_ps(a) simde_mm_castpd_ps(a) +#endif -SIMDE__FUNCTION_ATTRIBUTES 
+SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_castpd_si128(simde__m128d a) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128I_C(_mm_castpd_si128(a.n)); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_castpd_si128(a); #else - union { - simde__m128d pd; - simde__m128i si128; - } r; - r.pd = a; - return r.si128; + simde__m128i r; + simde_memcpy(&r, &a, sizeof(a)); + return r; #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_castpd_si128(a) simde_mm_castpd_si128(a) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_castps_pd(simde__m128 a) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128D_C(_mm_castps_pd(a.n)); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_castps_pd(a); #else - union { - simde__m128 ps; - simde__m128d pd; - } r; - r.ps = a; - return r.pd; + simde__m128d r; + simde_memcpy(&r, &a, sizeof(a)); + return r; #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_castps_pd(a) simde_mm_castps_pd(a) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_castps_si128(simde__m128 a) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128I_C(_mm_castps_si128(a.n)); -#elif defined(SIMDE_SSE2_NEON) - return SIMDE__M128I_NEON_C(i32, a.neon_i32); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_castps_si128(a); #else - union { - simde__m128 ps; - simde__m128i si128; - } r; - r.ps = a; - return r.si128; + simde__m128i r; + simde_memcpy(&r, &a, sizeof(a)); + return r; #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_castps_si128(a) simde_mm_castps_si128(a) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_castsi128_pd(simde__m128i a) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128D_C(_mm_castsi128_pd(a.n)); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_castsi128_pd(a); #else - union { - simde__m128i si128; - simde__m128d pd; - } r; - r.si128 = a; - return r.pd; + simde__m128d r; + simde_memcpy(&r, &a, sizeof(a)); + return r; #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_castsi128_pd(a) simde_mm_castsi128_pd(a) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_castsi128_ps(simde__m128i a) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128_C(_mm_castsi128_ps(a.n)); -#elif defined(SIMDE_SSE2_NEON) - return SIMDE__M128_NEON_C(f32, a.neon_f32); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_castsi128_ps(a); +#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) + return a; #else - union { - simde__m128i si128; - simde__m128 ps; - } r; - r.si128 = a; - return r.ps; + simde__m128 r; + simde_memcpy(&r, &a, sizeof(a)); + return r; #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_castsi128_ps(a) simde_mm_castsi128_ps(a) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_cmpeq_epi8(simde__m128i a, simde__m128i b) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128I_C(_mm_cmpeq_epi8(a.n, b.n)); -#elif defined(SIMDE_SSE2_NEON) - return SIMDE__M128I_NEON_C( - i8, vreinterpretq_s8_u8(vceqq_s8(b.neon_i8, a.neon_i8))); -#else - simde__m128i r; - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.i8) / sizeof(r.i8[0])); i++) { - r.i8[i] = (a.i8[i] == b.i8[i]) ? 
0xff : 0x00; +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_cmpeq_epi8(a, b); +#else + simde__m128i_private r_, a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i8 = vreinterpretq_s8_u8(vceqq_s8(b_.neon_i8, a_.neon_i8)); +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i8x16_eq(a_.wasm_v128, b_.wasm_v128); +#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) + r_.altivec_i8 = (SIMDE_POWER_ALTIVEC_VECTOR(signed char))vec_cmpeq( + a_.altivec_i8, b_.altivec_i8); +#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i8 = HEDLEY_STATIC_CAST(__typeof__(r_.i8), (a_.i8 == b_.i8)); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])); i++) { + r_.i8[i] = (a_.i8[i] == b_.i8[i]) ? ~INT8_C(0) : INT8_C(0); } - return r; +#endif + + return simde__m128i_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_cmpeq_epi8(a, b) simde_mm_cmpeq_epi8(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_cmpeq_epi16(simde__m128i a, simde__m128i b) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128I_C(_mm_cmpeq_epi16(a.n, b.n)); -#elif defined(SIMDE_SSE2_NEON) - return SIMDE__M128I_NEON_C( - i16, vreinterpretq_s16_u16(vceqq_s16(b.neon_i16, a.neon_i16))); -#else - simde__m128i r; - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) { - r.i16[i] = (a.i16[i] == b.i16[i]) ? 0xffff : 0x0000; +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_cmpeq_epi16(a, b); +#else + simde__m128i_private r_, a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i16 = + vreinterpretq_s16_u16(vceqq_s16(b_.neon_i16, a_.neon_i16)); +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i16x8_eq(a_.wasm_v128, b_.wasm_v128); +#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) + r_.altivec_i16 = (SIMDE_POWER_ALTIVEC_VECTOR(signed short))vec_cmpeq( + a_.altivec_i16, b_.altivec_i16); +#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i16 = (a_.i16 == b_.i16); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) { + r_.i16[i] = (a_.i16[i] == b_.i16[i]) ? ~INT16_C(0) : INT16_C(0); } - return r; +#endif + + return simde__m128i_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_cmpeq_epi16(a, b) simde_mm_cmpeq_epi16(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_cmpeq_epi32(simde__m128i a, simde__m128i b) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128I_C(_mm_cmpeq_epi32(a.n, b.n)); -#elif defined(SIMDE_SSE2_NEON) - return SIMDE__M128I_NEON_C( - i32, vreinterpretq_s32_u32(vceqq_s32(b.neon_i32, a.neon_i32))); -#else - simde__m128i r; - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) { - r.i32[i] = (a.i32[i] == b.i32[i]) ? 
0xffffffff : 0x00000000; +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_cmpeq_epi32(a, b); +#else + simde__m128i_private r_, a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i32 = + vreinterpretq_s32_u32(vceqq_s32(b_.neon_i32, a_.neon_i32)); +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i32x4_eq(a_.wasm_v128, b_.wasm_v128); +#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) + r_.altivec_i32 = (SIMDE_POWER_ALTIVEC_VECTOR(signed int))vec_cmpeq( + a_.altivec_i32, b_.altivec_i32); +#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), a_.i32 == b_.i32); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) { + r_.i32[i] = (a_.i32[i] == b_.i32[i]) ? ~INT32_C(0) : INT32_C(0); } - return r; +#endif + + return simde__m128i_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_cmpeq_epi32(a, b) simde_mm_cmpeq_epi32(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_cmpeq_pd(simde__m128d a, simde__m128d b) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128D_C(_mm_cmpeq_pd(a.n, b.n)); -#elif defined(SIMDE_SSE2_NEON) - return SIMDE__M128D_NEON_C( - i32, vreinterpretq_s32_u32( - vceqq_s32(vreinterpretq_s32_f32(b.neon_f32), - vreinterpretq_s32_f32(a.neon_f32)))); -#else - simde__m128d r; - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) { - r.u64[i] = (a.f64[i] == b.f64[i]) ? ~UINT64_C(0) : UINT64_C(0); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_cmpeq_pd(a, b); +#else + simde__m128d_private r_, a_ = simde__m128d_to_private(a), + b_ = simde__m128d_to_private(b); + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + r_.neon_i64 = vreinterpretq_s64_u64( + vceqq_s64(vreinterpretq_s64_f64(b_.neon_f64), + vreinterpretq_s64_f64(a_.neon_f64))); +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f64x2_eq(a_.wasm_v128, b_.wasm_v128); +#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) + r_.altivec_f64 = (SIMDE_POWER_ALTIVEC_VECTOR(double))vec_cmpeq( + a_.altivec_f64, b_.altivec_f64); +#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 == b_.f64)); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.f64) / sizeof(r_.f64[0])); i++) { + r_.u64[i] = (a_.f64[i] == b_.f64[i]) ? ~UINT64_C(0) + : UINT64_C(0); } - return r; +#endif + + return simde__m128d_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_cmpeq_pd(a, b) simde_mm_cmpeq_pd(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_cmpeq_sd(simde__m128d a, simde__m128d b) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128D_C(_mm_cmpeq_sd(a.n, b.n)); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_cmpeq_sd(a, b); +#elif defined(SIMDE_ASSUME_VECTORIZATION) + return simde_mm_move_sd(a, simde_mm_cmpeq_pd(a, b)); #else - simde__m128d r; - r.u64[0] = (a.f64[0] == b.f64[0]) ? ~UINT64_C(0) : 0; - r.u64[1] = a.u64[1]; - return r; + simde__m128d_private r_, a_ = simde__m128d_to_private(a), + b_ = simde__m128d_to_private(b); + + r_.u64[0] = (a_.u64[0] == b_.u64[0]) ? 
~UINT64_C(0) : 0; + r_.u64[1] = a_.u64[1]; + + return simde__m128d_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_cmpeq_sd(a, b) simde_mm_cmpeq_sd(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_cmpneq_pd(simde__m128d a, simde__m128d b) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128D_C(_mm_cmpneq_pd(a.n, b.n)); -#elif defined(SIMDE_SSE2_NEON) - return SIMDE__M128D_NEON_C(f32, - vreinterpretq_f32_u16(vmvnq_u16( - vceqq_s16(b.neon_i16, a.neon_i16)))); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_cmpneq_pd(a, b); #else - simde__m128d r; - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) { - r.u64[i] = (a.f64[i] != b.f64[i]) ? ~UINT64_C(0) : UINT64_C(0); + simde__m128d_private r_, a_ = simde__m128d_to_private(a), + b_ = simde__m128d_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_f32 = vreinterpretq_f32_u16( + vmvnq_u16(vceqq_s16(b_.neon_i16, a_.neon_i16))); +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f64x2_ne(a_.wasm_v128, b_.wasm_v128); +#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 != b_.f64)); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.f64) / sizeof(r_.f64[0])); i++) { + r_.u64[i] = (a_.f64[i] != b_.f64[i]) ? ~UINT64_C(0) + : UINT64_C(0); } - return r; +#endif + + return simde__m128d_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_cmpneq_pd(a, b) simde_mm_cmpneq_pd(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_cmpneq_sd(simde__m128d a, simde__m128d b) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128D_C(_mm_cmpneq_sd(a.n, b.n)); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_cmpneq_sd(a, b); +#elif defined(SIMDE_ASSUME_VECTORIZATION) + return simde_mm_move_sd(a, simde_mm_cmpneq_pd(a, b)); #else - simde__m128d r; - r.u64[0] = (a.f64[0] != b.f64[0]) ? ~UINT64_C(0) : UINT64_C(0); - r.u64[1] = a.u64[1]; - return r; + simde__m128d_private r_, a_ = simde__m128d_to_private(a), + b_ = simde__m128d_to_private(b); + + r_.u64[0] = (a_.f64[0] != b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0); + r_.u64[1] = a_.u64[1]; + + return simde__m128d_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_cmpneq_sd(a, b) simde_mm_cmpneq_sd(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_cmplt_epi8(simde__m128i a, simde__m128i b) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128I_C(_mm_cmplt_epi8(a.n, b.n)); -#elif defined(SIMDE_SSE2_NEON) - return SIMDE__M128I_NEON_C( - i8, vreinterpretq_s8_u8(vcltq_s8(a.neon_i8, b.neon_i8))); -#else - simde__m128i r; - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.i8) / sizeof(r.i8[0])); i++) { - r.i8[i] = (a.i8[i] < b.i8[i]) ? 
0xff : 0x00; +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_cmplt_epi8(a, b); +#else + simde__m128i_private r_, a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i8 = vreinterpretq_s8_u8(vcltq_s8(a_.neon_i8, b_.neon_i8)); +#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) + r_.altivec_i8 = HEDLEY_REINTERPRET_CAST( + SIMDE_POWER_ALTIVEC_VECTOR(signed char), + vec_cmplt(a_.altivec_i8, b_.altivec_i8)); +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i8x16_lt(a_.wasm_v128, b_.wasm_v128); +#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i8 = HEDLEY_STATIC_CAST(__typeof__(r_.i8), (a_.i8 < b_.i8)); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])); i++) { + r_.i8[i] = (a_.i8[i] < b_.i8[i]) ? ~INT8_C(0) : INT8_C(0); } - return r; +#endif + + return simde__m128i_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_cmplt_epi8(a, b) simde_mm_cmplt_epi8(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_cmplt_epi16(simde__m128i a, simde__m128i b) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128I_C(_mm_cmplt_epi16(a.n, b.n)); -#elif defined(SIMDE_SSE2_NEON) - return SIMDE__M128I_NEON_C( - i16, vreinterpretq_s16_u16(vcltq_s16(a.neon_i16, b.neon_i16))); -#else - simde__m128i r; - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) { - r.i16[i] = (a.i16[i] < b.i16[i]) ? 0xffff : 0x0000; +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_cmplt_epi16(a, b); +#else + simde__m128i_private r_, a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i16 = + vreinterpretq_s16_u16(vcltq_s16(a_.neon_i16, b_.neon_i16)); +#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) + r_.altivec_i16 = HEDLEY_REINTERPRET_CAST( + SIMDE_POWER_ALTIVEC_VECTOR(signed short), + vec_cmplt(a_.altivec_i16, b_.altivec_i16)); +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i16x8_lt(a_.wasm_v128, b_.wasm_v128); +#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i16 = HEDLEY_STATIC_CAST(__typeof__(r_.i16), (a_.i16 < b_.i16)); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) { + r_.i16[i] = (a_.i16[i] < b_.i16[i]) ? ~INT16_C(0) : INT16_C(0); } - return r; +#endif + + return simde__m128i_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_cmplt_epi16(a, b) simde_mm_cmplt_epi16(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_cmplt_epi32(simde__m128i a, simde__m128i b) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128I_C(_mm_cmplt_epi32(a.n, b.n)); -#elif defined(SIMDE_SSE2_NEON) - return SIMDE__M128I_NEON_C( - i32, vreinterpretq_s32_u32(vcltq_s32(a.neon_i32, b.neon_i32))); -#else - simde__m128i r; - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) { - r.i32[i] = (a.i32[i] < b.i32[i]) ? 
0xffffffff : 0x00000000; +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_cmplt_epi32(a, b); +#else + simde__m128i_private r_, a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i32 = + vreinterpretq_s32_u32(vcltq_s32(a_.neon_i32, b_.neon_i32)); +#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) + r_.altivec_i32 = HEDLEY_REINTERPRET_CAST( + SIMDE_POWER_ALTIVEC_VECTOR(signed int), + vec_cmplt(a_.altivec_i32, b_.altivec_i32)); +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i32x4_lt(a_.wasm_v128, b_.wasm_v128); +#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.i32 < b_.i32)); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) { + r_.i32[i] = (a_.i32[i] < b_.i32[i]) ? ~INT32_C(0) : INT32_C(0); } - return r; +#endif + + return simde__m128i_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_cmplt_epi32(a, b) simde_mm_cmplt_epi32(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_cmplt_pd(simde__m128d a, simde__m128d b) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128D_C(_mm_cmplt_pd(a.n, b.n)); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_cmplt_pd(a, b); #else - simde__m128d r; - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) { - r.u64[i] = (a.f64[i] < b.f64[i]) ? ~UINT64_C(0) : UINT64_C(0); + simde__m128d_private r_, a_ = simde__m128d_to_private(a), + b_ = simde__m128d_to_private(b); + +#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 < b_.f64)); +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f64x2_lt(a_.wasm_v128, b_.wasm_v128); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.f64) / sizeof(r_.f64[0])); i++) { + r_.u64[i] = (a_.f64[i] < b_.f64[i]) ? ~UINT64_C(0) + : UINT64_C(0); } - return r; +#endif + + return simde__m128d_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_cmplt_pd(a, b) simde_mm_cmplt_pd(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_cmplt_sd(simde__m128d a, simde__m128d b) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128D_C(_mm_cmplt_sd(a.n, b.n)); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_cmplt_sd(a, b); +#elif defined(SIMDE_ASSUME_VECTORIZATION) + return simde_mm_move_sd(a, simde_mm_cmplt_pd(a, b)); #else - simde__m128d r; - r.u64[0] = (a.f64[0] < b.f64[0]) ? ~UINT64_C(0) : UINT64_C(0); - r.u64[1] = a.u64[1]; - return r; + simde__m128d_private r_, a_ = simde__m128d_to_private(a), + b_ = simde__m128d_to_private(b); + + r_.u64[0] = (a_.f64[0] < b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0); + r_.u64[1] = a_.u64[1]; + + return simde__m128d_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_cmplt_sd(a, b) simde_mm_cmplt_sd(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_cmple_pd(simde__m128d a, simde__m128d b) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128D_C(_mm_cmple_pd(a.n, b.n)); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_cmple_pd(a, b); #else - simde__m128d r; - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) { - r.u64[i] = (a.f64[i] <= b.f64[i]) ? 
~UINT64_C(0) : UINT64_C(0); + simde__m128d_private r_, a_ = simde__m128d_to_private(a), + b_ = simde__m128d_to_private(b); + +#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 <= b_.f64)); +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f64x2_le(a_.wasm_v128, b_.wasm_v128); +#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) + r_.altivec_f64 = (SIMDE_POWER_ALTIVEC_VECTOR(double))vec_cmple( + a_.altivec_f64, b_.altivec_f64); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.f64) / sizeof(r_.f64[0])); i++) { + r_.u64[i] = (a_.f64[i] <= b_.f64[i]) ? ~UINT64_C(0) + : UINT64_C(0); } - return r; +#endif + + return simde__m128d_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_cmple_pd(a, b) simde_mm_cmple_pd(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_cmple_sd(simde__m128d a, simde__m128d b) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128D_C(_mm_cmple_sd(a.n, b.n)); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_cmple_sd(a, b); +#elif defined(SIMDE_ASSUME_VECTORIZATION) + return simde_mm_move_sd(a, simde_mm_cmple_pd(a, b)); #else - simde__m128d r; - r.u64[0] = (a.f64[0] <= b.f64[0]) ? ~UINT64_C(0) : UINT64_C(0); - r.u64[1] = a.u64[1]; - return r; + simde__m128d_private r_, a_ = simde__m128d_to_private(a), + b_ = simde__m128d_to_private(b); + + r_.u64[0] = (a_.f64[0] <= b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0); + r_.u64[1] = a_.u64[1]; + + return simde__m128d_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_cmple_sd(a, b) simde_mm_cmple_sd(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_cmpgt_epi8(simde__m128i a, simde__m128i b) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128I_C(_mm_cmpgt_epi8(a.n, b.n)); -#elif defined(SIMDE_SSE2_NEON) - return SIMDE__M128I_NEON_C( - i8, vreinterpretq_s8_u8(vcgtq_s8(a.neon_i8, b.neon_i8))); -#else - simde__m128i r; - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.i8) / sizeof(r.i8[0])); i++) { - r.i8[i] = (a.i8[i] > b.i8[i]) ? 0xff : 0x00; +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_cmpgt_epi8(a, b); +#else + simde__m128i_private r_, a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i8 = vreinterpretq_s8_u8(vcgtq_s8(a_.neon_i8, b_.neon_i8)); +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i8x16_gt(a_.wasm_v128, b_.wasm_v128); +#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) + r_.altivec_i8 = (SIMDE_POWER_ALTIVEC_VECTOR(signed char))vec_cmpgt( + a_.altivec_i8, b_.altivec_i8); +#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i8 = HEDLEY_STATIC_CAST(__typeof__(r_.i8), (a_.i8 > b_.i8)); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])); i++) { + r_.i8[i] = (a_.i8[i] > b_.i8[i]) ? 
~INT8_C(0) : INT8_C(0); } - return r; +#endif + + return simde__m128i_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_cmpgt_epi8(a, b) simde_mm_cmpgt_epi8(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_cmpgt_epi16(simde__m128i a, simde__m128i b) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128I_C(_mm_cmpgt_epi16(a.n, b.n)); -#elif defined(SIMDE_SSE2_NEON) - return SIMDE__M128I_NEON_C( - i16, vreinterpretq_s16_u16(vcgtq_s16(a.neon_i16, b.neon_i16))); -#else - simde__m128i r; - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) { - r.i16[i] = (a.i16[i] > b.i16[i]) ? 0xffff : 0x0000; +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_cmpgt_epi16(a, b); +#else + simde__m128i_private r_, a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i16 = + vreinterpretq_s16_u16(vcgtq_s16(a_.neon_i16, b_.neon_i16)); +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i16x8_gt(a_.wasm_v128, b_.wasm_v128); +#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) + r_.altivec_i16 = HEDLEY_REINTERPRET_CAST( + SIMDE_POWER_ALTIVEC_VECTOR(signed short), + vec_cmpgt(a_.altivec_i16, b_.altivec_i16)); +#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i16 = HEDLEY_STATIC_CAST(__typeof__(r_.i16), (a_.i16 > b_.i16)); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) { + r_.i16[i] = (a_.i16[i] > b_.i16[i]) ? ~INT16_C(0) : INT16_C(0); } - return r; +#endif + + return simde__m128i_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_cmpgt_epi16(a, b) simde_mm_cmpgt_epi16(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_cmpgt_epi32(simde__m128i a, simde__m128i b) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128I_C(_mm_cmpgt_epi32(a.n, b.n)); -#elif defined(SIMDE_SSE2_NEON) - return SIMDE__M128I_NEON_C( - i32, vreinterpretq_s32_u32(vcgtq_s32(a.neon_i32, b.neon_i32))); -#else - simde__m128i r; - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) { - r.i32[i] = (a.i32[i] > b.i32[i]) ? 0xffffffff : 0x00000000; +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_cmpgt_epi32(a, b); +#else + simde__m128i_private r_, a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i32 = + vreinterpretq_s32_u32(vcgtq_s32(a_.neon_i32, b_.neon_i32)); +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i32x4_gt(a_.wasm_v128, b_.wasm_v128); +#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) + r_.altivec_i32 = (SIMDE_POWER_ALTIVEC_VECTOR(signed int))vec_cmpgt( + a_.altivec_i32, b_.altivec_i32); +#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.i32 > b_.i32)); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) { + r_.i32[i] = (a_.i32[i] > b_.i32[i]) ? 
~INT32_C(0) : INT32_C(0); } - return r; +#endif + + return simde__m128i_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_cmpgt_epi32(a, b) simde_mm_cmpgt_epi32(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_cmpgt_pd(simde__m128d a, simde__m128d b) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128D_C(_mm_cmpgt_pd(a.n, b.n)); -#else - simde__m128d r; - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) { - r.u64[i] = (a.f64[i] > b.f64[i]) ? ~UINT64_C(0) : UINT64_C(0); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_cmpgt_pd(a, b); +#else + simde__m128d_private r_, a_ = simde__m128d_to_private(a), + b_ = simde__m128d_to_private(b); + +#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 > b_.f64)); +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f64x2_gt(a_.wasm_v128, b_.wasm_v128); +#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) + r_.altivec_f64 = + HEDLEY_STATIC_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), + vec_cmpgt(a_.altivec_f64, b_.altivec_f64)); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.f64) / sizeof(r_.f64[0])); i++) { + r_.u64[i] = (a_.f64[i] > b_.f64[i]) ? ~UINT64_C(0) + : UINT64_C(0); } - return r; +#endif + + return simde__m128d_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_cmpgt_pd(a, b) simde_mm_cmpgt_pd(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_cmpgt_sd(simde__m128d a, simde__m128d b) { -#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI) - return SIMDE__M128D_C(_mm_cmpgt_sd(a.n, b.n)); +#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI) + return _mm_cmpgt_sd(a, b); +#elif defined(SIMDE_ASSUME_VECTORIZATION) + return simde_mm_move_sd(a, simde_mm_cmpgt_pd(a, b)); #else - simde__m128d r; - r.u64[0] = (a.f64[0] > b.f64[0]) ? ~UINT64_C(0) : UINT64_C(0); - r.u64[1] = a.u64[1]; - return r; + simde__m128d_private r_, a_ = simde__m128d_to_private(a), + b_ = simde__m128d_to_private(b); + + r_.u64[0] = (a_.f64[0] > b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0); + r_.u64[1] = a_.u64[1]; + + return simde__m128d_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_cmpgt_sd(a, b) simde_mm_cmpgt_sd(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_cmpge_pd(simde__m128d a, simde__m128d b) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128D_C(_mm_cmpge_pd(a.n, b.n)); -#else - simde__m128d r; - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) { - r.u64[i] = (a.f64[i] >= b.f64[i]) ? ~UINT64_C(0) : UINT64_C(0); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_cmpge_pd(a, b); +#else + simde__m128d_private r_, a_ = simde__m128d_to_private(a), + b_ = simde__m128d_to_private(b); + +#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 >= b_.f64)); +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f64x2_ge(a_.wasm_v128, b_.wasm_v128); +#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) + r_.altivec_f64 = + HEDLEY_STATIC_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), + vec_cmpge(a_.altivec_f64, b_.altivec_f64)); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.f64) / sizeof(r_.f64[0])); i++) { + r_.u64[i] = (a_.f64[i] >= b_.f64[i]) ? 
~UINT64_C(0) + : UINT64_C(0); } - return r; +#endif + + return simde__m128d_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_cmpge_pd(a, b) simde_mm_cmpge_pd(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_cmpge_sd(simde__m128d a, simde__m128d b) { -#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI) - return SIMDE__M128D_C(_mm_cmpge_sd(a.n, b.n)); +#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI) + return _mm_cmpge_sd(a, b); +#elif defined(SIMDE_ASSUME_VECTORIZATION) + return simde_mm_move_sd(a, simde_mm_cmpge_pd(a, b)); #else - simde__m128d r; - r.u64[0] = (a.f64[0] >= b.f64[0]) ? ~UINT64_C(0) : UINT64_C(0); - r.u64[1] = a.u64[1]; - return r; + simde__m128d_private r_, a_ = simde__m128d_to_private(a), + b_ = simde__m128d_to_private(b); + + r_.u64[0] = (a_.f64[0] >= b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0); + r_.u64[1] = a_.u64[1]; + + return simde__m128d_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_cmpge_sd(a, b) simde_mm_cmpge_sd(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_cmpnge_pd(simde__m128d a, simde__m128d b) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128D_C(_mm_cmpnge_pd(a.n, b.n)); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_cmpnge_pd(a, b); #else return simde_mm_cmplt_pd(a, b); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_cmpnge_pd(a, b) simde_mm_cmpnge_pd(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_cmpnge_sd(simde__m128d a, simde__m128d b) { -#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI) - return SIMDE__M128D_C(_mm_cmpnge_sd(a.n, b.n)); +#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI) + return _mm_cmpnge_sd(a, b); #else return simde_mm_cmplt_sd(a, b); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_cmpnge_sd(a, b) simde_mm_cmpnge_sd(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_cmpnlt_pd(simde__m128d a, simde__m128d b) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128D_C(_mm_cmpnlt_pd(a.n, b.n)); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_cmpnlt_pd(a, b); #else return simde_mm_cmpge_pd(a, b); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_cmpnlt_pd(a, b) simde_mm_cmpnlt_pd(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_cmpnlt_sd(simde__m128d a, simde__m128d b) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128D_C(_mm_cmpnlt_sd(a.n, b.n)); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_cmpnlt_sd(a, b); #else return simde_mm_cmpge_sd(a, b); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_cmpnlt_sd(a, b) simde_mm_cmpnlt_sd(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_cmpnle_pd(simde__m128d a, simde__m128d b) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128D_C(_mm_cmpnle_pd(a.n, b.n)); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_cmpnle_pd(a, b); #else return simde_mm_cmpgt_pd(a, b); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_cmpnle_pd(a, b) simde_mm_cmpnle_pd(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_cmpnle_sd(simde__m128d a, simde__m128d b) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128D_C(_mm_cmpnle_sd(a.n, b.n)); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_cmpnle_sd(a, b); #else 
return simde_mm_cmpgt_sd(a, b); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_cmpnle_sd(a, b) simde_mm_cmpnle_sd(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_cmpord_pd(simde__m128d a, simde__m128d b) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128D_C(_mm_cmpord_pd(a.n, b.n)); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_cmpord_pd(a, b); #else - simde__m128d r; - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) { - r.u64[i] = (!isnan(a.f64[i]) && !isnan(b.f64[i])) ? ~UINT64_C(0) - : UINT64_C(0); + simde__m128d_private r_, a_ = simde__m128d_to_private(a), + b_ = simde__m128d_to_private(b); + +#if defined(simde_math_isnan) + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.f64) / sizeof(r_.f64[0])); i++) { + r_.u64[i] = (!simde_math_isnan(a_.f64[i]) && + !simde_math_isnan(b_.f64[i])) + ? ~UINT64_C(0) + : UINT64_C(0); } - return r; +#else + HEDLEY_UNREACHABLE(); +#endif + + return simde__m128d_from_private(r_); +#endif +} +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_cmpord_pd(a, b) simde_mm_cmpord_pd(a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64 simde_mm_cvtsd_f64(simde__m128d a) +{ +#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI) + return _mm_cvtsd_f64(a); +#else + simde__m128d_private a_ = simde__m128d_to_private(a); + return a_.f64[0]; #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_cvtsd_f64(a) simde_mm_cvtsd_f64(a) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_cmpord_sd(simde__m128d a, simde__m128d b) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128D_C(_mm_cmpord_sd(a.n, b.n)); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_cmpord_sd(a, b); +#elif defined(SIMDE_ASSUME_VECTORIZATION) + return simde_mm_move_sd(a, simde_mm_cmpord_pd(a, b)); #else - simde__m128d r; - r.u64[0] = (!isnan(a.f64[0]) && !isnan(b.f64[0])) ? ~UINT64_C(0) - : UINT64_C(0); - r.u64[1] = a.u64[1]; - return r; + simde__m128d_private r_, a_ = simde__m128d_to_private(a), + b_ = simde__m128d_to_private(b); + +#if defined(simde_math_isnan) + r_.u64[0] = + (!simde_math_isnan(a_.f64[0]) && !simde_math_isnan(b_.f64[0])) + ? ~UINT64_C(0) + : UINT64_C(0); + r_.u64[1] = a_.u64[1]; +#else + HEDLEY_UNREACHABLE(); +#endif + + return simde__m128d_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_cmpord_sd(a, b) simde_mm_cmpord_sd(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_cmpunord_pd(simde__m128d a, simde__m128d b) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128D_C(_mm_cmpunord_pd(a.n, b.n)); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_cmpunord_pd(a, b); #else - simde__m128d r; - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) { - r.u64[i] = (isnan(a.f64[i]) || isnan(b.f64[i])) ? ~UINT64_C(0) - : UINT64_C(0); + simde__m128d_private r_, a_ = simde__m128d_to_private(a), + b_ = simde__m128d_to_private(b); + +#if defined(simde_math_isnan) + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.f64) / sizeof(r_.f64[0])); i++) { + r_.u64[i] = (simde_math_isnan(a_.f64[i]) || + simde_math_isnan(b_.f64[i])) + ? 
~UINT64_C(0) + : UINT64_C(0); } - return r; +#else + HEDLEY_UNREACHABLE(); +#endif + + return simde__m128d_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_cmpunord_pd(a, b) simde_mm_cmpunord_pd(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_cmpunord_sd(simde__m128d a, simde__m128d b) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128D_C(_mm_cmpunord_sd(a.n, b.n)); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_cmpunord_sd(a, b); +#elif defined(SIMDE_ASSUME_VECTORIZATION) + return simde_mm_move_sd(a, simde_mm_cmpunord_pd(a, b)); #else - simde__m128d r; - r.u64[0] = (isnan(a.f64[0]) || isnan(b.f64[0])) ? ~UINT64_C(0) - : UINT64_C(0); - r.u64[1] = a.u64[1]; - return r; + simde__m128d_private r_, a_ = simde__m128d_to_private(a), + b_ = simde__m128d_to_private(b); + +#if defined(simde_math_isnan) + r_.u64[0] = (simde_math_isnan(a_.f64[0]) || simde_math_isnan(b_.f64[0])) + ? ~UINT64_C(0) + : UINT64_C(0); + r_.u64[1] = a_.u64[1]; + +#else + HEDLEY_UNREACHABLE(); +#endif + + return simde__m128d_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_cmpunord_sd(a, b) simde_mm_cmpunord_sd(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_cvtepi32_pd(simde__m128i a) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128D_C(_mm_cvtepi32_pd(a.n)); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_cvtepi32_pd(a); #else - simde__m128d r; - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) { - r.f64[i] = (simde_float64)a.i32[i]; + simde__m128d_private r_; + simde__m128i_private a_ = simde__m128i_to_private(a); + +#if defined(SIMDE_CONVERT_VECTOR_) + SIMDE_CONVERT_VECTOR_(r_.f64, a_.m64_private[0].i32); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.f64) / sizeof(r_.f64[0])); i++) { + r_.f64[i] = (simde_float64)a_.i32[i]; } - return r; +#endif + + return simde__m128d_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_cvtepi32_pd(a) simde_mm_cvtepi32_pd(a) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_cvtepi32_ps(simde__m128i a) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128_C(_mm_cvtepi32_ps(a.n)); -#elif defined(SIMDE_SSE2_NEON) - return SIMDE__M128_NEON_C(f32, vcvtq_f32_s32(a.neon_i32)); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_cvtepi32_ps(a); #else - simde__m128 r; - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) { - r.f32[i] = (simde_float32)a.i32[i]; + simde__m128_private r_; + simde__m128i_private a_ = simde__m128i_to_private(a); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_f32 = vcvtq_f32_s32(a_.neon_i32); +#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) + r_.altivec_f32 = vec_ctf(a_.altivec_i32, 0); +#elif defined(SIMDE_CONVERT_VECTOR_) + SIMDE_CONVERT_VECTOR_(r_.f32, a_.i32); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) { + r_.f32[i] = (simde_float32)a_.i32[i]; } - return r; +#endif + + return simde__m128_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_cvtepi32_ps(a) simde_mm_cvtepi32_ps(a) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_cvtpd_epi32(simde__m128d a) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128I_C(_mm_cvtpd_epi32(a.n)); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_cvtpd_epi32(a); #else - 
simde__m128i r; - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) { - r.i32[i] = (int32_t)a.f64[i]; + simde__m128i_private r_; + simde__m128d_private a_ = simde__m128d_to_private(a); + +#if defined(SIMDE_CONVERT_VECTOR_) + SIMDE_CONVERT_VECTOR_(r_.m64_private[0].i32, a_.f64); + r_.m64_private[1] = simde__m64_to_private(simde_mm_setzero_si64()); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(a_.f64) / sizeof(a_.f64[0])); i++) { + r_.i32[i] = HEDLEY_STATIC_CAST(int32_t, a_.f64[i]); } - return r; + simde_memset(&(r_.m64_private[1]), 0, sizeof(r_.m64_private[1])); +#endif + + return simde__m128i_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_cvtpd_epi32(a) simde_mm_cvtpd_epi32(a) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_cvtpd_pi32(simde__m128d a) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M64_C(_mm_cvtpd_pi32(a.n)); +#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) + return _mm_cvtpd_pi32(a); +#else + simde__m64_private r_; + simde__m128d_private a_ = simde__m128d_to_private(a); + +#if defined(SIMDE_CONVERT_VECTOR_) + SIMDE_CONVERT_VECTOR_(r_.i32, a_.f64); #else - simde__m64 r; - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) { - r.i32[i] = (int32_t)a.f64[i]; + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) { + r_.i32[i] = HEDLEY_STATIC_CAST(int32_t, a_.f64[i]); } - return r; +#endif + + return simde__m64_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_cvtpd_pi32(a) simde_mm_cvtpd_pi32(a) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_cvtpd_ps(simde__m128d a) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128_C(_mm_cvtpd_ps(a.n)); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_cvtpd_ps(a); #else - simde__m128 r; - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(a.f64) / sizeof(a.f64[0])); i++) { - r.f32[i] = (simde_float32)a.f64[i]; + simde__m128_private r_; + simde__m128d_private a_ = simde__m128d_to_private(a); + +#if defined(SIMDE_CONVERT_VECTOR_) + SIMDE_CONVERT_VECTOR_(r_.m64_private[0].f32, a_.f64); + r_.m64_private[1] = simde__m64_to_private(simde_mm_setzero_si64()); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(a_.f64) / sizeof(a_.f64[0])); i++) { + r_.f32[i] = (simde_float32)a_.f64[i]; } - return r; + simde_memset(&(r_.m64_private[1]), 0, sizeof(r_.m64_private[1])); +#endif + + return simde__m128_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_cvtpd_ps(a) simde_mm_cvtpd_ps(a) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_cvtpi32_pd(simde__m64 a) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128D_C(_mm_cvtpi32_pd(a.n)); +#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) + return _mm_cvtpi32_pd(a); #else - simde__m128d r; - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) { - r.f64[i] = (simde_float64)a.i32[i]; + simde__m128d_private r_; + simde__m64_private a_ = simde__m64_to_private(a); + +#if defined(SIMDE_CONVERT_VECTOR_) + SIMDE_CONVERT_VECTOR_(r_.f64, a_.i32); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.f64) / sizeof(r_.f64[0])); i++) { + r_.f64[i] = (simde_float64)a_.i32[i]; } - return r; +#endif + + return simde__m128d_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) 
+#define _mm_cvtpi32_pd(a) simde_mm_cvtpi32_pd(a) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_cvtps_epi32(simde__m128 a) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128I_C(_mm_cvtps_epi32(a.n)); -#elif defined(SIMDE_SSE2_NEON) +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_cvtps_epi32(a); +#else + simde__m128i_private r_; + simde__m128_private a_ = simde__m128_to_private(a); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) /* The default rounding mode on SSE is 'round to even', which ArmV7 does not support! It is supported on ARMv8 however. */ #if defined(SIMDE_ARCH_AARCH64) - return SIMDE__M128I_NEON_C(i32, vcvtnq_s32_f32(a.neon_f32)); + r_.neon_i32 = vcvtnq_s32_f32(a_.neon_f32); #else uint32x4_t signmask = vdupq_n_u32(0x80000000); - float32x4_t half = vbslq_f32(signmask, a.neon_f32, + float32x4_t half = vbslq_f32(signmask, a_.neon_f32, vdupq_n_f32(0.5f)); /* +/- 0.5 */ int32x4_t r_normal = vcvtq_s32_f32( - vaddq_f32(a.neon_f32, half)); /* round to integer: [a + 0.5]*/ + vaddq_f32(a_.neon_f32, half)); /* round to integer: [a + 0.5]*/ int32x4_t r_trunc = - vcvtq_s32_f32(a.neon_f32); /* truncate to integer: [a] */ + vcvtq_s32_f32(a_.neon_f32); /* truncate to integer: [a] */ int32x4_t plusone = vshrq_n_s32(vnegq_s32(r_trunc), 31); /* 1 or 0 */ int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone), vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */ float32x4_t delta = vsubq_f32( - a.neon_f32, + a_.neon_f32, vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */ uint32x4_t is_delta_half = vceqq_f32(delta, half); /* delta == +/- 0.5 */ - return SIMDE__M128I_NEON_C(i32, - vbslq_s32(is_delta_half, r_even, r_normal)); + r_.neon_i32 = vbslq_s32(is_delta_half, r_even, r_normal); #endif +#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) + r_.altivec_i32 = vec_cts(a_.altivec_f32, 0); +#elif defined(SIMDE_CONVERT_VECTOR_) + SIMDE_CONVERT_VECTOR_(r_.i32, a_.f32); #else - simde__m128i r; - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) { - r.i32[i] = (int32_t)a.f32[i]; + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) { + r_.i32[i] = HEDLEY_STATIC_CAST(int32_t, a_.f32[i]); } - return r; +#endif + + return simde__m128i_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_cvtps_epi32(a) simde_mm_cvtps_epi32(a) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_cvtps_pd(simde__m128 a) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128D_C(_mm_cvtps_pd(a.n)); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_cvtps_pd(a); #else - simde__m128d r; - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) { - r.f64[i] = a.f32[i]; + simde__m128d_private r_; + simde__m128_private a_ = simde__m128_to_private(a); + +#if defined(SIMDE_CONVERT_VECTOR_) + SIMDE_CONVERT_VECTOR_(r_.f64, a_.m64_private[0].f32); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.f64) / sizeof(r_.f64[0])); i++) { + r_.f64[i] = a_.f32[i]; } - return r; #endif -} -SIMDE__FUNCTION_ATTRIBUTES -double simde_mm_cvtsd_f64(simde__m128d a) -{ -#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI) - return _mm_cvtsd_f64(a.n); -#else - return a.f64[0]; + return simde__m128d_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_cvtps_pd(a) simde_mm_cvtps_pd(a) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES int32_t simde_mm_cvtsd_si32(simde__m128d a) { -#if defined(SIMDE_SSE2_NATIVE) 
- return _mm_cvtsd_si32(a.n); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_cvtsd_si32(a); #else - return (int32_t)a.f64[0]; + simde__m128d_private a_ = simde__m128d_to_private(a); + return SIMDE_CONVERT_FTOI(int32_t, a_.f64[0]); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_cvtsd_si32(a) simde_mm_cvtsd_si32(a) +#endif -SIMDE__FUNCTION_ATTRIBUTES -int32_t simde_mm_cvtsd_si64(simde__m128d a) +SIMDE_FUNCTION_ATTRIBUTES +int64_t simde_mm_cvtsd_si64(simde__m128d a) { -#if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64) +#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64) #if defined(__PGI) - return _mm_cvtsd_si64x(a.n); + return _mm_cvtsd_si64x(a); #else - return _mm_cvtsd_si64(a.n); + return _mm_cvtsd_si64(a); #endif #else - return (int32_t)a.f64[0]; + simde__m128d_private a_ = simde__m128d_to_private(a); + return SIMDE_CONVERT_FTOI(int64_t, a_.f64[0]); #endif } #define simde_mm_cvtsd_si64x(a) simde_mm_cvtsd_si64(a) +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_cvtsd_si64(a) simde_mm_cvtsd_si64(a) +#define _mm_cvtsd_si64x(a) simde_mm_cvtsd_si64x(a) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_cvtsd_ss(simde__m128 a, simde__m128d b) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128_C(_mm_cvtsd_ss(a.n, b.n)); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_cvtsd_ss(a, b); #else - simde__m128 r; + simde__m128_private r_, a_ = simde__m128_to_private(a); + simde__m128d_private b_ = simde__m128d_to_private(b); - r.f32[0] = (simde_float32)b.f64[0]; + r_.f32[0] = HEDLEY_STATIC_CAST(simde_float32, b_.f64[0]); - SIMDE__VECTORIZE - for (size_t i = 1; i < (sizeof(r) / sizeof(r.i32[0])); i++) { - r.i32[i] = a.i32[i]; + SIMDE_VECTORIZE + for (size_t i = 1; i < (sizeof(r_) / sizeof(r_.i32[0])); i++) { + r_.i32[i] = a_.i32[i]; } - return r; + return simde__m128_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_cvtsd_ss(a, b) simde_mm_cvtsd_ss(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES int32_t simde_mm_cvtsi128_si32(simde__m128i a) { -#if defined(SIMDE_SSE2_NATIVE) - return _mm_cvtsi128_si32(a.n); -#elif defined(SIMDE_SSE2_NEON) - return vgetq_lane_s32(a.neon_i32, 0); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_cvtsi128_si32(a); +#else + simde__m128i_private a_ = simde__m128i_to_private(a); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vgetq_lane_s32(a_.neon_i32, 0); +#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) +#if defined(SIMDE_BUG_GCC_95227) + (void)a_; +#endif + return vec_extract(a_.altivec_i32, 0); #else - return a.i32[0]; + return a_.i32[0]; +#endif #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_cvtsi128_si32(a) simde_mm_cvtsi128_si32(a) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES int64_t simde_mm_cvtsi128_si64(simde__m128i a) { -#if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64) +#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64) #if defined(__PGI) - return _mm_cvtsi128_si64x(a.n); + return _mm_cvtsi128_si64x(a); #else - return _mm_cvtsi128_si64(a.n); + return _mm_cvtsi128_si64(a); #endif #else - return a.i64[0]; + simde__m128i_private a_ = simde__m128i_to_private(a); +#if defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) && !defined(HEDLEY_IBM_VERSION) + return vec_extract(a_.i64, 0); +#endif + return a_.i64[0]; #endif } #define simde_mm_cvtsi128_si64x(a) simde_mm_cvtsi128_si64(a) +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define 
_mm_cvtsi128_si64(a) simde_mm_cvtsi128_si64(a) +#define _mm_cvtsi128_si64x(a) simde_mm_cvtsi128_si64x(a) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_cvtsi32_sd(simde__m128d a, int32_t b) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128D_C(_mm_cvtsi32_sd(a.n, b)); + +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_cvtsi32_sd(a, b); #else - simde__m128d r; + simde__m128d_private r_; + simde__m128d_private a_ = simde__m128d_to_private(a); - r.f64[0] = (simde_float64)b; - r.i64[1] = a.i64[1]; +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_AMD64) + r_.neon_f64 = vsetq_lane_f64((simde_float64)b, a_.neon_f64, 0); +#else + r_.f64[0] = HEDLEY_STATIC_CAST(simde_float64, b); + r_.i64[1] = a_.i64[1]; +#endif - return r; + return simde__m128d_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_cvtsi32_sd(a, b) simde_mm_cvtsi32_sd(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_cvtsi32_si128(int32_t a) { - simde__m128i r; +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_cvtsi32_si128(a); +#else + simde__m128i_private r_; -#if defined(SIMDE_SSE2_NATIVE) - r.n = _mm_cvtsi32_si128(a); -#elif defined(SIMDE_SSE2_NEON) - r.neon_i32 = vsetq_lane_s32(a, vdupq_n_s32(0), 0); +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i32 = vsetq_lane_s32(a, vdupq_n_s32(0), 0); #else - r.i32[0] = a; - r.i32[1] = 0; - r.i32[2] = 0; - r.i32[3] = 0; + r_.i32[0] = a; + r_.i32[1] = 0; + r_.i32[2] = 0; + r_.i32[3] = 0; #endif - return r; + return simde__m128i_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_cvtsi32_si128(a) simde_mm_cvtsi32_si128(a) +#endif -SIMDE__FUNCTION_ATTRIBUTES -simde__m128d simde_mm_cvtsi64_sd(simde__m128d a, int32_t b) +SIMDE_FUNCTION_ATTRIBUTES +simde__m128d simde_mm_cvtsi64_sd(simde__m128d a, int64_t b) { - simde__m128d r; - -#if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64) +#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64) #if !defined(__PGI) - r.n = _mm_cvtsi64_sd(a.n, b); + return _mm_cvtsi64_sd(a, b); #else - r.n = _mm_cvtsi64x_sd(a.n, b); + return _mm_cvtsi64x_sd(a, b); #endif #else - r.f64[0] = (simde_float64)b; - r.f64[1] = a.f64[1]; + simde__m128d_private r_, a_ = simde__m128d_to_private(a); + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + r_.neon_f64 = vsetq_lane_f64((simde_float64)b, a_.neon_f64, 0); +#else + r_.f64[0] = HEDLEY_STATIC_CAST(simde_float64, b); + r_.f64[1] = a_.f64[1]; #endif - return r; + return simde__m128d_from_private(r_); +#endif } -#define simde_mm_cvtsi64x_sd(a, b) simde_mm_cvtsi64(a, b) +#define simde_mm_cvtsi64x_sd(a, b) simde_mm_cvtsi64_sd(a, b) +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_cvtsi64_sd(a, b) simde_mm_cvtsi64_sd(a, b) +#define _mm_cvtsi64x_sd(a, b) simde_mm_cvtsi64x_sd(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_cvtsi64_si128(int64_t a) { - simde__m128i r; - -#if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64) +#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64) #if !defined(__PGI) - r.n = _mm_cvtsi64_si128(a); + return _mm_cvtsi64_si128(a); #else - r.n = _mm_cvtsi64x_si128(a); + return _mm_cvtsi64x_si128(a); #endif #else - r.i64[0] = a; - r.i64[1] = 0; -#endif + simde__m128i_private r_; - return r; + r_.i64[0] = a; + r_.i64[1] = 0; + + return simde__m128i_from_private(r_); +#endif } #define simde_mm_cvtsi64x_si128(a) simde_mm_cvtsi64_si128(a) +#if 
defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_cvtsi64_si128(a) simde_mm_cvtsi64_si128(a) +#define _mm_cvtsi64x_si128(a) simde_mm_cvtsi64x_si128(a) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_cvtss_sd(simde__m128d a, simde__m128 b) { - simde__m128d r; - -#if defined(SIMDE_SSE2_NATIVE) - r.n = _mm_cvtss_sd(a.n, b.n); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_cvtss_sd(a, b); #else - r.f64[0] = b.f32[0]; - r.i64[1] = a.i64[1]; -#endif + simde__m128d_private a_ = simde__m128d_to_private(a); + simde__m128_private b_ = simde__m128_to_private(b); - return r; + a_.f64[0] = HEDLEY_STATIC_CAST(simde_float64, b_.f32[0]); + + return simde__m128d_from_private(a_); +#endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_cvtss_sd(a, b) simde_mm_cvtss_sd(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_cvttpd_epi32(simde__m128d a) { - simde__m128i r; - -#if defined(SIMDE_SSE2_NATIVE) - r.n = _mm_cvttpd_epi32(a.n); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_cvttpd_epi32(a); #else - for (size_t i = 0; i < (sizeof(a.f64) / sizeof(a.f64[0])); i++) { - r.i32[i] = (int32_t)trunc(a.f64[i]); + simde__m128i_private r_; + simde__m128d_private a_ = simde__m128d_to_private(a); + + for (size_t i = 0; i < (sizeof(a_.f64) / sizeof(a_.f64[0])); i++) { + r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, a_.f64[i]); } -#endif + simde_memset(&(r_.m64_private[1]), 0, sizeof(r_.m64_private[1])); - return r; + return simde__m128i_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_cvttpd_epi32(a) simde_mm_cvttpd_epi32(a) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_cvttpd_pi32(simde__m128d a) { - simde__m64 r; +#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) + return _mm_cvttpd_pi32(a); +#else + simde__m64_private r_; + simde__m128d_private a_ = simde__m128d_to_private(a); -#if defined(SIMDE_SSE2_NATIVE) - r.n = _mm_cvttpd_pi32(a.n); +#if defined(SIMDE_CONVERT_VECTOR_) + SIMDE_CONVERT_VECTOR_(r_.i32, a_.f64); #else - for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) { - r.i32[i] = (int32_t)trunc(a.f64[i]); + for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) { + r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, a_.f64[i]); } #endif - return r; + return simde__m64_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_cvttpd_pi32(a) simde_mm_cvttpd_pi32(a) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_cvttps_epi32(simde__m128 a) { - simde__m128i r; +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_cvttps_epi32(a); +#else + simde__m128i_private r_; + simde__m128_private a_ = simde__m128_to_private(a); -#if defined(SIMDE_SSE2_NATIVE) - r.n = _mm_cvttps_epi32(a.n); -#elif defined(SIMDE_SSE2_NEON) - r.neon_i32 = vcvtq_s32_f32(a.neon_f32); +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i32 = vcvtq_s32_f32(a_.neon_f32); +#elif defined(SIMDE_CONVERT_VECTOR_) + SIMDE_CONVERT_VECTOR_(r_.i32, a_.f32); #else - for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) { - r.i32[i] = (int32_t)truncf(a.f32[i]); + for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) { + r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, a_.f32[i]); } #endif - return r; + return simde__m128i_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_cvttps_epi32(a) simde_mm_cvttps_epi32(a) +#endif 
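(Editorial sketch, not part of the patch.) The hunks above port the SSE2 conversion intrinsics to the new simde__m128i_private/simde__m128d_private pattern, and the rounding behaviour differs between them: simde_mm_cvtps_epi32 rounds (to nearest-even on the native SSE2 and NEON paths; note that the plain scalar fallback in this revision simply casts, i.e. truncates), while simde_mm_cvttps_epi32 always truncates toward zero. A minimal, hypothetical usage sketch follows to make that concrete. The include path "util/simde/sse2.h", building from a libobs checkout, and sse2.h pulling in the bundled sse.h (for simde_mm_set_ps), as upstream SIMDe does, are assumptions.

#include <stdio.h>
#include <stdint.h>
#include "util/simde/sse2.h" /* hypothetical include path for the bundled copy */

int main(void)
{
	/* element 0 is the last argument of simde_mm_set_ps */
	simde__m128 v = simde_mm_set_ps(2.5f, -1.5f, 0.5f, 1.5f);

	/* rounding conversion: 1.5f -> 2 on the native SSE2 and NEON paths
	 * (the portable scalar fallback above casts, which truncates to 1) */
	int32_t r = simde_mm_cvtsi128_si32(simde_mm_cvtps_epi32(v));

	/* truncating conversion: 1.5f -> 1 on every path */
	int32_t t = simde_mm_cvtsi128_si32(simde_mm_cvttps_epi32(v));

	printf("cvtps_epi32:  %d\n", (int)r);
	printf("cvttps_epi32: %d\n", (int)t);
	return 0;
}

The same pattern applies to the double-precision variants (simde_mm_cvtpd_epi32 versus simde_mm_cvttpd_epi32) introduced in the hunks above.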
-SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES int32_t simde_mm_cvttsd_si32(simde__m128d a) { -#if defined(SIMDE_SSE2_NATIVE) - return _mm_cvttsd_si32(a.n); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_cvttsd_si32(a); #else - return (int32_t)trunc(a.f64[0]); + simde__m128d_private a_ = simde__m128d_to_private(a); + return SIMDE_CONVERT_FTOI(int32_t, a_.f64[0]); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_cvttsd_si32(a) simde_mm_cvttsd_si32(a) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES int64_t simde_mm_cvttsd_si64(simde__m128d a) { -#if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64) +#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64) #if !defined(__PGI) - return _mm_cvttsd_si64(a.n); + return _mm_cvttsd_si64(a); #else - return _mm_cvttsd_si64x(a.n); + return _mm_cvttsd_si64x(a); #endif #else - return (int64_t)trunc(a.f64[0]); + simde__m128d_private a_ = simde__m128d_to_private(a); + return SIMDE_CONVERT_FTOI(int64_t, a_.f64[0]); #endif } #define simde_mm_cvttsd_si64x(a) simde_mm_cvttsd_si64(a) +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_cvttsd_si64(a) simde_mm_cvttsd_si64(a) +#define _mm_cvttsd_si64x(a) simde_mm_cvttsd_si64x(a) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_div_pd(simde__m128d a, simde__m128d b) { - simde__m128d r; +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_div_pd(a, b); +#else + simde__m128d_private r_, a_ = simde__m128d_to_private(a), + b_ = simde__m128d_to_private(b); -#if defined(SIMDE_SSE2_NATIVE) - r.n = _mm_div_pd(a.n, b.n); +#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.f64 = a_.f64 / b_.f64; +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f64x2_div(a_.wasm_v128, b_.wasm_v128); #else - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) { - r.f64[i] = a.f64[i] / b.f64[i]; + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.f64) / sizeof(r_.f64[0])); i++) { + r_.f64[i] = a_.f64[i] / b_.f64[i]; } #endif - return r; + return simde__m128d_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_div_pd(a, b) simde_mm_div_pd(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_div_sd(simde__m128d a, simde__m128d b) { - simde__m128d r; - -#if defined(SIMDE_SSE2_NATIVE) - r.n = _mm_div_sd(a.n, b.n); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_div_sd(a, b); +#elif defined(SIMDE_ASSUME_VECTORIZATION) + return simde_mm_move_sd(a, simde_mm_div_pd(a, b)); #else - r.f64[0] = a.f64[0] / b.f64[0]; - r.f64[1] = a.f64[1]; -#endif + simde__m128d_private r_, a_ = simde__m128d_to_private(a), + b_ = simde__m128d_to_private(b); - return r; + r_.f64[0] = a_.f64[0] / b_.f64[0]; + r_.f64[1] = a_.f64[1]; + + return simde__m128d_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_div_sd(a, b) simde_mm_div_sd(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES int32_t simde_mm_extract_epi16(simde__m128i a, const int imm8) + SIMDE_REQUIRE_RANGE(imm8, 0, 7) { - return a.u16[imm8 & 7]; + uint16_t r; + simde__m128i_private a_ = simde__m128i_to_private(a); + +#if defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) +#if defined(SIMDE_BUG_GCC_95227) + (void)a_; + (void)imm8; +#endif + r = vec_extract(a_.altivec_i16, imm8); +#else + r = a_.u16[imm8 & 7]; +#endif + + return HEDLEY_STATIC_CAST(int32_t, r); } -#if defined(SIMDE_SSE2_NATIVE) && \ - (!defined(SIMDE__REALLY_GCC) || 
HEDLEY_GCC_VERSION_CHECK(4, 6, 0)) -#define simde_mm_extract_epi16(a, imm8) _mm_extract_epi16(a.n, imm8) -#elif defined(SIMDE_SSE2_NEON) -#define simde_mm_extract_epi16(a, imm8) \ - (vgetq_lane_s16((a).neon_i16, (imm8)) & ((int32_t)UINT32_C(0x0000ffff))) +#if defined(SIMDE_X86_SSE2_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(4, 6, 0)) +#define simde_mm_extract_epi16(a, imm8) _mm_extract_epi16(a, imm8) +#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) +#define simde_mm_extract_epi16(a, imm8) \ + HEDLEY_STATIC_CAST(int32_t, \ + vgetq_lane_s16(simde__m128i_to_private(a).neon_i16, \ + (imm8)) & \ + (UINT32_C(0x0000ffff))) +#endif +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_extract_epi16(a, imm8) simde_mm_extract_epi16(a, imm8) #endif -SIMDE__FUNCTION_ATTRIBUTES -simde__m128i simde_mm_insert_epi16(simde__m128i a, int32_t i, const int imm8) +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i simde_mm_insert_epi16(simde__m128i a, int16_t i, const int imm8) + SIMDE_REQUIRE_RANGE(imm8, 0, 7) { - a.u16[imm8 & 7] = (int16_t)i; - return a; + simde__m128i_private a_ = simde__m128i_to_private(a); + a_.i16[imm8 & 7] = i; + return simde__m128i_from_private(a_); } -#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI) +#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI) +#define simde_mm_insert_epi16(a, i, imm8) _mm_insert_epi16((a), (i), (imm8)) +#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) #define simde_mm_insert_epi16(a, i, imm8) \ - SIMDE__M128I_C(_mm_insert_epi16((a).n, (i), (imm8))) -#elif defined(SIMDE_SSE2_NEON) -#define simde_mm_insert_epi16(a, i, imm8) \ - SIMDE__M128I_NEON_C(i16, vsetq_lane_s16((i), a.neon_i16, (imm8))) + simde__m128i_from_neon_i16( \ + vsetq_lane_s16((i), simde__m128i_to_neon_i16(a), (imm8))) +#endif +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_insert_epi16(a, i, imm8) simde_mm_insert_epi16(a, i, imm8) #endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_load_pd(simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)]) { - simde__m128d r; - simde_assert_aligned(16, mem_addr); -#if defined(SIMDE_SSE2_NATIVE) - r.n = _mm_load_pd(mem_addr); -#elif defined(SIMDE_SSE2_NEON) - r.neon_u32 = vld1q_u32((uint32_t const *)mem_addr); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_load_pd(mem_addr); +#else + simde__m128d_private r_; + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_u32 = + vld1q_u32(HEDLEY_REINTERPRET_CAST(uint32_t const *, mem_addr)); +#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) && !defined(HEDLEY_IBM_VERSION) + r_.altivec_f64 = vec_ld( + 0, HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double) + const *, + mem_addr)); #else - SIMDE__ASSUME_ALIGNED(mem_addr, 16); - memcpy(&r, mem_addr, sizeof(r)); + r_ = *SIMDE_ALIGN_CAST(simde__m128d_private const *, mem_addr); #endif - return r; + return simde__m128d_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_load_pd(mem_addr) simde_mm_load_pd(mem_addr) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_load_pd1(simde_float64 const *mem_addr) { - simde__m128d r; - -#if defined(SIMDE_SSE2_NATIVE) - r.n = _mm_load_pd1(mem_addr); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_load1_pd(mem_addr); #else - r.f64[0] = *mem_addr; - r.f64[1] = *mem_addr; -#endif + simde__m128d_private r_; - return r; + r_.f64[0] = *mem_addr; + r_.f64[1] = *mem_addr; + + return simde__m128d_from_private(r_); +#endif } #define simde_mm_load1_pd(mem_addr) simde_mm_load_pd1(mem_addr) +#if 
defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_load_pd1(mem_addr) simde_mm_load_pd1(mem_addr) +#define _mm_load1_pd(mem_addr) simde_mm_load1_pd(mem_addr) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_load_sd(simde_float64 const *mem_addr) { - simde__m128d r; +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_load_sd(mem_addr); +#else + simde__m128d_private r_; -#if defined(SIMDE_SSE2_NATIVE) - r.n = _mm_load_sd(mem_addr); +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + r_.neon_f64 = vsetq_lane_f64(*mem_addr, vdupq_n_f64(0), 0); #else - memcpy(&r, mem_addr, sizeof(simde_float64)); - r.u64[1] = 0; + r_.f64[0] = *mem_addr; + r_.u64[1] = UINT64_C(0); #endif - return r; + return simde__m128d_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_load_sd(mem_addr) simde_mm_load_sd(mem_addr) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_load_si128(simde__m128i const *mem_addr) { - simde__m128i r; - simde_assert_aligned(16, mem_addr); -#if defined(SIMDE_SSE2_NATIVE) - r.n = _mm_load_si128(&(mem_addr->n)); -#elif defined(SIMDE_SSE2_NEON) - r.neon_i32 = vld1q_s32((int32_t const *)mem_addr); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_load_si128( + HEDLEY_REINTERPRET_CAST(__m128i const *, mem_addr)); +#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + simde__m128i_private r_; + +#if defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) + r_.altivec_i32 = vec_ld( + 0, HEDLEY_REINTERPRET_CAST( + SIMDE_POWER_ALTIVEC_VECTOR(int) const *, mem_addr)); #else - SIMDE__ASSUME_ALIGNED(mem_addr, 16); - memcpy(&r, mem_addr, sizeof(r)); + r_.neon_i32 = vld1q_s32((int32_t const *)mem_addr); #endif - return r; + return simde__m128i_from_private(r_); +#else + return *mem_addr; +#endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_load_si128(mem_addr) simde_mm_load_si128(mem_addr) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_loadh_pd(simde__m128d a, simde_float64 const *mem_addr) { - simde__m128d r; - -#if defined(SIMDE_SSE2_NATIVE) - r.n = _mm_loadh_pd(a.n, mem_addr); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_loadh_pd(a, mem_addr); #else + simde__m128d_private r_, a_ = simde__m128d_to_private(a); simde_float64 t; - memcpy(&t, mem_addr, sizeof(t)); - r.f64[0] = a.f64[0]; - r.f64[1] = t; -#endif - return r; + simde_memcpy(&t, mem_addr, sizeof(t)); + r_.f64[0] = a_.f64[0]; + r_.f64[1] = t; + + return simde__m128d_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_loadh_pd(a, mem_addr) simde_mm_loadh_pd(a, mem_addr) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_loadl_epi64(simde__m128i const *mem_addr) { - simde__m128i r; + simde_assert_aligned(16, mem_addr); + +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_loadl_epi64( + HEDLEY_REINTERPRET_CAST(__m128i const *, mem_addr)); +#else + simde__m128i_private r_; -#if defined(SIMDE_SSE2_NATIVE) - r.n = _mm_loadl_epi64(&mem_addr->n); -#elif defined(SIMDE_SSE2_NEON) - r.neon_i32 = vcombine_s32(vld1_s32((int32_t const *)mem_addr), - vcreate_s32(0)); +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i32 = vcombine_s32(vld1_s32((int32_t const *)mem_addr), + vcreate_s32(0)); #else - r.u64[0] = mem_addr->u64[0]; - r.u64[1] = 0; + r_.i64[0] = *HEDLEY_REINTERPRET_CAST(int64_t const *, mem_addr); + r_.i64[1] = 0; #endif - return r; + return simde__m128i_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) 
+#define _mm_loadl_epi64(mem_addr) simde_mm_loadl_epi64(mem_addr) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_loadl_pd(simde__m128d a, simde_float64 const *mem_addr) { - simde__m128d r; - -#if defined(SIMDE_SSE2_NATIVE) - r.n = _mm_loadl_pd(a.n, mem_addr); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_loadl_pd(a, mem_addr); #else - memcpy(&r, mem_addr, sizeof(simde_float64)); - r.u64[1] = a.u64[1]; -#endif + simde__m128d_private r_, a_ = simde__m128d_to_private(a); - return r; + r_.f64[0] = *mem_addr; + r_.u64[1] = a_.u64[1]; + + return simde__m128d_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_loadl_pd(a, mem_addr) simde_mm_loadl_pd(a, mem_addr) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_loadr_pd(simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)]) { - simde__m128d r; - simde_assert_aligned(16, mem_addr); -#if defined(SIMDE_SSE2_NATIVE) - r.n = _mm_loadr_pd(mem_addr); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_loadr_pd(mem_addr); #else - SIMDE__ASSUME_ALIGNED(mem_addr, 16); - r.f64[0] = mem_addr[1]; - r.f64[1] = mem_addr[0]; -#endif + simde__m128d_private r_; - return r; + r_.f64[0] = mem_addr[1]; + r_.f64[1] = mem_addr[0]; + + return simde__m128d_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_loadr_pd(mem_addr) simde_mm_loadr_pd(mem_addr) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_loadu_pd(simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)]) { - simde__m128d r; - -#if defined(SIMDE_SSE2_NATIVE) - r.n = _mm_loadu_pd(mem_addr); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_loadu_pd(mem_addr); #else - simde_float64 l, h; - memcpy(&l, &mem_addr[0], sizeof(l)); - memcpy(&h, &mem_addr[1], sizeof(h)); - r.f64[0] = l; - r.f64[1] = h; -#endif + simde__m128d_private r_; - return r; + simde_memcpy(&r_, mem_addr, sizeof(r_)); + + return simde__m128d_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_loadu_pd(mem_addr) simde_mm_loadu_pd(mem_addr) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_loadu_si128(simde__m128i const *mem_addr) { - simde__m128i r; +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_loadu_si128(HEDLEY_STATIC_CAST(__m128i const *, mem_addr)); +#else + simde__m128i_private r_; -#if defined(SIMDE_SSE2_NATIVE) - r.n = _mm_loadu_si128(&((*mem_addr).n)); -#elif defined(SIMDE_SSE2_NEON) - r.neon_i32 = vld1q_s32((int32_t const *)mem_addr); +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i32 = vld1q_s32((int32_t const *)mem_addr); #else - memcpy(&r, mem_addr, sizeof(r)); + simde_memcpy(&r_, mem_addr, sizeof(r_)); #endif - return r; + return simde__m128i_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_loadu_si128(mem_addr) simde_mm_loadu_si128(mem_addr) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_madd_epi16(simde__m128i a, simde__m128i b) { - simde__m128i r; +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_madd_epi16(a, b); +#else + simde__m128i_private r_, a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); -#if defined(SIMDE_SSE2_NATIVE) - r.n = _mm_madd_epi16(a.n, b.n); -#elif defined(SIMDE_SSE2_NEON) +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) int32x4_t pl = - vmull_s16(vget_low_s16(a.neon_i16), vget_low_s16(b.neon_i16)); - int32x4_t ph = - vmull_s16(vget_high_s16(a.neon_i16), 
vget_high_s16(b.neon_i16)); + vmull_s16(vget_low_s16(a_.neon_i16), vget_low_s16(b_.neon_i16)); + int32x4_t ph = vmull_s16(vget_high_s16(a_.neon_i16), + vget_high_s16(b_.neon_i16)); int32x2_t rl = vpadd_s32(vget_low_s32(pl), vget_high_s32(pl)); int32x2_t rh = vpadd_s32(vget_low_s32(ph), vget_high_s32(ph)); - r.neon_i32 = vcombine_s32(rl, rh); + r_.neon_i32 = vcombine_s32(rl, rh); #else - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r) / sizeof(r.i16[0])); i += 2) { - r.i32[i / 2] = - (a.i16[i] * b.i16[i]) + (a.i16[i + 1] * b.i16[i + 1]); + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_) / sizeof(r_.i16[0])); i += 2) { + r_.i32[i / 2] = (a_.i16[i] * b_.i16[i]) + + (a_.i16[i + 1] * b_.i16[i + 1]); } #endif - return r; + return simde__m128i_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_madd_epi16(a, b) simde_mm_madd_epi16(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES void simde_mm_maskmoveu_si128(simde__m128i a, simde__m128i mask, int8_t mem_addr[HEDLEY_ARRAY_PARAM(16)]) { -#if defined(SIMDE_SSE2_NATIVE) - _mm_maskmoveu_si128(a.n, mask.n, (char *)mem_addr); +#if defined(SIMDE_X86_SSE2_NATIVE) + _mm_maskmoveu_si128(a, mask, HEDLEY_REINTERPRET_CAST(char *, mem_addr)); #else - for (size_t i = 0; i < 16; i++) { - if (mask.u8[i] & 0x80) { - mem_addr[i] = a.i8[i]; + simde__m128i_private a_ = simde__m128i_to_private(a), + mask_ = simde__m128i_to_private(mask); + + for (size_t i = 0; i < (sizeof(a_.i8) / sizeof(a_.i8[0])); i++) { + if (mask_.u8[i] & 0x80) { + mem_addr[i] = a_.i8[i]; } } #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_maskmoveu_si128(a, mask, mem_addr) \ + simde_mm_maskmoveu_si128( \ + (a), (mask), \ + SIMDE_CHECKED_REINTERPRET_CAST(int8_t *, char *, (mem_addr))) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES int32_t simde_mm_movemask_epi8(simde__m128i a) { -#if defined(SIMDE_SSE2_NATIVE) - return _mm_movemask_epi8(a.n); -#elif defined(SIMDE_SSE2_NEON) - uint8x16_t input = a.neon_u8; - SIMDE_ALIGN(16) +#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__INTEL_COMPILER) + /* ICC has trouble with _mm_movemask_epi8 at -O2 and above: */ + return _mm_movemask_epi8(a); +#else + int32_t r = 0; + simde__m128i_private a_ = simde__m128i_to_private(a); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + uint8x16_t input = a_.neon_u8; + SIMDE_ALIGN_AS(16, int8x8_t) static const int8_t xr[8] = {-7, -6, -5, -4, -3, -2, -1, 0}; uint8x8_t mask_and = vdup_n_u8(0x80); int8x8_t mask_shift = vld1_s8(xr); @@ -1900,2298 +3038,3177 @@ int32_t simde_mm_movemask_epi8(simde__m128i a) hi = vpadd_u8(hi, hi); hi = vpadd_u8(hi, hi); - return ((hi[0] << 8) | (lo[0] & 0xFF)); -#else - int32_t r = 0; - SIMDE__VECTORIZE_REDUCTION(| : r) - for (size_t i = 0; i < 16; i++) { - r |= (a.u8[15 - i] >> 7) << (15 - i); + r = ((hi[0] << 8) | (lo[0] & 0xFF)); +#elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) && !defined(HEDLEY_IBM_VERSION) + static const SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) + perm = {120, 112, 104, 96, 88, 80, 72, 64, + 56, 48, 40, 32, 24, 16, 8, 0}; + r = HEDLEY_STATIC_CAST( + int32_t, vec_extract(vec_vbpermq(a_.altivec_u8, perm), 1)); +#else + SIMDE_VECTORIZE_REDUCTION(| : r) + for (size_t i = 0; i < (sizeof(a_.u8) / sizeof(a_.u8[0])); i++) { + r |= (a_.u8[15 - i] >> 7) << (15 - i); } +#endif + return r; #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_movemask_epi8(a) simde_mm_movemask_epi8(a) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES int32_t 
simde_mm_movemask_pd(simde__m128d a) { -#if defined(SIMDE_SSE2_NATIVE) - return _mm_movemask_pd(a.n); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_movemask_pd(a); #else int32_t r = 0; - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(a.u64) / sizeof(a.u64[0])); i++) { - r |= (a.u64[i] >> 63) << i; + simde__m128d_private a_ = simde__m128d_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(a_.u64) / sizeof(a_.u64[0])); i++) { + r |= (a_.u64[i] >> 63) << i; } + return r; #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_movemask_pd(a) simde_mm_movemask_pd(a) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_movepi64_pi64(simde__m128i a) { - simde__m64 r; - -#if defined(SIMDE_SSE2_NATIVE) - r.n = _mm_movepi64_pi64(a.n); +#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) + return _mm_movepi64_pi64(a); #else - r.i64[0] = a.i64[0]; -#endif + simde__m64_private r_; + simde__m128i_private a_ = simde__m128i_to_private(a); - return r; + r_.i64[0] = a_.i64[0]; + + return simde__m64_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_movepi64_pi64(a) simde_mm_movepi64_pi64(a) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_movpi64_epi64(simde__m64 a) { - simde__m128i r; - -#if defined(SIMDE_SSE2_NATIVE) - r.n = _mm_movpi64_epi64(a.n); +#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) + return _mm_movpi64_epi64(a); #else - r.i64[0] = a.i64[0]; - r.i64[1] = 0; -#endif + simde__m128i_private r_; + simde__m64_private a_ = simde__m64_to_private(a); - return r; + r_.i64[0] = a_.i64[0]; + r_.i64[1] = 0; + + return simde__m128i_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_movpi64_epi64(a) simde_mm_movpi64_epi64(a) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_min_epi16(simde__m128i a, simde__m128i b) { - simde__m128i r; +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_min_epi16(a, b); +#else + simde__m128i_private r_, a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); -#if defined(SIMDE_SSE2_NATIVE) - r.n = _mm_min_epi16(a.n, b.n); -#elif defined(SIMDE_SSE2_NEON) - r.neon_i16 = vminq_s16(a.neon_i16, b.neon_i16); +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i16 = vminq_s16(a_.neon_i16, b_.neon_i16); #else - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) { - r.i16[i] = (a.i16[i] < b.i16[i]) ? a.i16[i] : b.i16[i]; + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) { + r_.i16[i] = (a_.i16[i] < b_.i16[i]) ? 
a_.i16[i] : b_.i16[i]; } #endif - return r; + return simde__m128i_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_min_epi16(a, b) simde_mm_min_epi16(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_min_epu8(simde__m128i a, simde__m128i b) { - simde__m128i r; +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_min_epu8(a, b); +#else + simde__m128i_private r_, a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); -#if defined(SIMDE_SSE2_NATIVE) - r.n = _mm_min_epu8(a.n, b.n); -#elif defined(SIMDE_SSE2_NEON) - r.neon_u8 = vminq_u8(a.neon_u8, b.neon_u8); +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_u8 = vminq_u8(a_.neon_u8, b_.neon_u8); #else - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.u8) / sizeof(r.u8[0])); i++) { - r.u8[i] = (a.u8[i] < b.u8[i]) ? a.u8[i] : b.u8[i]; + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.u8) / sizeof(r_.u8[0])); i++) { + r_.u8[i] = (a_.u8[i] < b_.u8[i]) ? a_.u8[i] : b_.u8[i]; } #endif - return r; + return simde__m128i_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_min_epu8(a, b) simde_mm_min_epu8(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_min_pd(simde__m128d a, simde__m128d b) { - simde__m128d r; - -#if defined(SIMDE_SSE2_NATIVE) - r.n = _mm_min_pd(a.n, b.n); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_min_pd(a, b); #else - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) { - r.f64[i] = (a.f64[i] < b.f64[i]) ? a.f64[i] : b.f64[i]; + simde__m128d_private r_, a_ = simde__m128d_to_private(a), + b_ = simde__m128d_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.f64) / sizeof(r_.f64[0])); i++) { + r_.f64[i] = (a_.f64[i] < b_.f64[i]) ? a_.f64[i] : b_.f64[i]; } -#endif - return r; + return simde__m128d_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_min_pd(a, b) simde_mm_min_pd(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_min_sd(simde__m128d a, simde__m128d b) { - simde__m128d r; - -#if defined(SIMDE_SSE2_NATIVE) - r.n = _mm_min_sd(a.n, b.n); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_min_sd(a, b); +#elif defined(SIMDE_ASSUME_VECTORIZATION) + return simde_mm_move_sd(a, simde_mm_min_pd(a, b)); #else - r.f64[0] = (a.f64[0] < b.f64[0]) ? a.f64[0] : b.f64[0]; - r.f64[1] = a.f64[1]; -#endif + simde__m128d_private r_, a_ = simde__m128d_to_private(a), + b_ = simde__m128d_to_private(b); - return r; + r_.f64[0] = (a_.f64[0] < b_.f64[0]) ? 
a_.f64[0] : b_.f64[0]; + r_.f64[1] = a_.f64[1]; + + return simde__m128d_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_min_sd(a, b) simde_mm_min_sd(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_max_epi16(simde__m128i a, simde__m128i b) { - simde__m128i r; +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_max_epi16(a, b); +#else + simde__m128i_private r_, a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); -#if defined(SIMDE_SSE2_NATIVE) - r.n = _mm_max_epi16(a.n, b.n); -#elif defined(SIMDE_SSE2_NEON) - r.neon_i16 = vmaxq_s16(a.neon_i16, b.neon_i16); +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i16 = vmaxq_s16(a_.neon_i16, b_.neon_i16); +#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) + r_.altivec_i16 = vec_max(a_.altivec_i16, b_.altivec_i16); #else - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) { - r.i16[i] = (a.i16[i] > b.i16[i]) ? a.i16[i] : b.i16[i]; + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) { + r_.i16[i] = (a_.i16[i] > b_.i16[i]) ? a_.i16[i] : b_.i16[i]; } #endif - return r; + return simde__m128i_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_max_epi16(a, b) simde_mm_max_epi16(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_max_epu8(simde__m128i a, simde__m128i b) { - simde__m128i r; +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_max_epu8(a, b); +#else + simde__m128i_private r_, a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); -#if defined(SIMDE_SSE2_NATIVE) - r.n = _mm_max_epu8(a.n, b.n); -#elif defined(SIMDE_SSE2_NEON) - r.neon_u8 = vmaxq_u8(a.neon_u8, b.neon_u8); +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_u8 = vmaxq_u8(a_.neon_u8, b_.neon_u8); +#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) + r_.altivec_u8 = vec_max(a_.altivec_u8, b_.altivec_u8); #else - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.u8) / sizeof(r.u8[0])); i++) { - r.u8[i] = (a.u8[i] > b.u8[i]) ? a.u8[i] : b.u8[i]; + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.u8) / sizeof(r_.u8[0])); i++) { + r_.u8[i] = (a_.u8[i] > b_.u8[i]) ? a_.u8[i] : b_.u8[i]; } #endif - return r; + return simde__m128i_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_max_epu8(a, b) simde_mm_max_epu8(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_max_pd(simde__m128d a, simde__m128d b) { - simde__m128d r; +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_max_pd(a, b); +#else + simde__m128d_private r_, a_ = simde__m128d_to_private(a), + b_ = simde__m128d_to_private(b); -#if defined(SIMDE_SSE2_NATIVE) - r.n = _mm_max_pd(a.n, b.n); +#if defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) + r_.altivec_f64 = vec_max(a_.altivec_f64, b_.altivec_f64); #else - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) { - r.f64[i] = (a.f64[i] > b.f64[i]) ? a.f64[i] : b.f64[i]; + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.f64) / sizeof(r_.f64[0])); i++) { + r_.f64[i] = (a_.f64[i] > b_.f64[i]) ? 
a_.f64[i] : b_.f64[i]; } #endif - return r; + return simde__m128d_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_max_pd(a, b) simde_mm_max_pd(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_max_sd(simde__m128d a, simde__m128d b) { - simde__m128d r; - -#if defined(SIMDE_SSE2_NATIVE) - r.n = _mm_max_sd(a.n, b.n); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_max_sd(a, b); +#elif defined(SIMDE_ASSUME_VECTORIZATION) + return simde_mm_move_sd(a, simde_mm_max_pd(a, b)); #else - r.f64[0] = (a.f64[0] > b.f64[0]) ? a.f64[0] : b.f64[0]; - r.f64[1] = a.f64[1]; -#endif + simde__m128d_private r_, a_ = simde__m128d_to_private(a), + b_ = simde__m128d_to_private(b); - return r; -} - -SIMDE__FUNCTION_ATTRIBUTES -simde__m128i simde_mm_move_epi64(simde__m128i a) -{ - simde__m128i r; + r_.f64[0] = (a_.f64[0] > b_.f64[0]) ? a_.f64[0] : b_.f64[0]; + r_.f64[1] = a_.f64[1]; -#if defined(SIMDE_SSE2_NATIVE) - r.n = _mm_move_epi64(a.n); -#elif defined(SIMDE_SSE2_NEON) - r.neon_i64 = vsetq_lane_s64(0, a.neon_i64, 1); -#else - r.i64[0] = a.i64[0]; - r.i64[1] = 0; + return simde__m128d_from_private(r_); #endif - - return r; } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_max_sd(a, b) simde_mm_max_sd(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES -simde__m128d simde_mm_move_sd(simde__m128d a, simde__m128d b) +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i simde_mm_move_epi64(simde__m128i a) { - simde__m128d r; +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_move_epi64(a); +#else + simde__m128i_private r_, a_ = simde__m128i_to_private(a); -#if defined(SIMDE_SSE2_NATIVE) - r.n = _mm_move_sd(a.n, b.n); +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i64 = vsetq_lane_s64(0, a_.neon_i64, 1); #else - r.f64[0] = b.f64[0]; - r.f64[1] = a.f64[1]; + r_.i64[0] = a_.i64[0]; + r_.i64[1] = 0; #endif - return r; + return simde__m128i_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_move_epi64(a) simde_mm_move_epi64(a) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_mul_epu32(simde__m128i a, simde__m128i b) { - simde__m128i r; - -#if defined(SIMDE_SSE2_NATIVE) - r.n = _mm_mul_epu32(a.n, b.n); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_mul_epu32(a, b); #else - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.u64) / sizeof(r.u64[0])); i++) { - r.u64[i] = ((uint64_t)a.u32[i * 2]) * ((uint64_t)b.u32[i * 2]); + simde__m128i_private r_, a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.u64) / sizeof(r_.u64[0])); i++) { + r_.u64[i] = HEDLEY_STATIC_CAST(uint64_t, a_.u32[i * 2]) * + HEDLEY_STATIC_CAST(uint64_t, b_.u32[i * 2]); } -#endif - return r; + return simde__m128i_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_mul_epu32(a, b) simde_mm_mul_epu32(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_x_mm_mul_epi64(simde__m128i a, simde__m128i b) { - simde__m128i r; + simde__m128i_private r_, a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) { - r.i64[i] = a.i64[i] * b.i64[i]; +#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i64 = a_.i64 * b_.i64; +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i64) / sizeof(r_.i64[0])); i++) { + r_.i64[i] = a_.i64[i] * b_.i64[i]; } +#endif - return r; + 
return simde__m128i_from_private(r_); } -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_x_mm_mod_epi64(simde__m128i a, simde__m128i b) { - simde__m128i r; + simde__m128i_private r_, a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) { - r.i64[i] = a.i64[i] % b.i64[i]; +#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i64 = a_.i64 % b_.i64; +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i64) / sizeof(r_.i64[0])); i++) { + r_.i64[i] = a_.i64[i] % b_.i64[i]; } +#endif - return r; + return simde__m128i_from_private(r_); } -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_mul_pd(simde__m128d a, simde__m128d b) { - simde__m128d r; +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_mul_pd(a, b); +#else + simde__m128d_private r_, a_ = simde__m128d_to_private(a), + b_ = simde__m128d_to_private(b); -#if defined(SIMDE_SSE2_NATIVE) - r.n = _mm_mul_pd(a.n, b.n); +#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.f64 = a_.f64 * b_.f64; +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f64x2_mul(a_.wasm_v128, b_.wasm_v128); #else - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) { - r.f64[i] = a.f64[i] * b.f64[i]; + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.f64) / sizeof(r_.f64[0])); i++) { + r_.f64[i] = a_.f64[i] * b_.f64[i]; } #endif - return r; + return simde__m128d_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_mul_pd(a, b) simde_mm_mul_pd(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_mul_sd(simde__m128d a, simde__m128d b) { - simde__m128d r; - -#if defined(SIMDE_SSE2_NATIVE) - r.n = _mm_mul_sd(a.n, b.n); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_mul_sd(a, b); +#elif defined(SIMDE_ASSUME_VECTORIZATION) + return simde_mm_move_sd(a, simde_mm_mul_pd(a, b)); #else - r.f64[0] = a.f64[0] * b.f64[0]; - r.f64[1] = a.f64[1]; -#endif + simde__m128d_private r_, a_ = simde__m128d_to_private(a), + b_ = simde__m128d_to_private(b); - return r; + r_.f64[0] = a_.f64[0] * b_.f64[0]; + r_.f64[1] = a_.f64[1]; + + return simde__m128d_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_mul_sd(a, b) simde_mm_mul_sd(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_mul_su32(simde__m64 a, simde__m64 b) { - simde__m64 r; - -#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI) - r.n = _mm_mul_su32(a.n, b.n); +#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && \ + !defined(__PGI) + return _mm_mul_su32(a, b); #else - r.u64[0] = ((uint64_t)a.u32[0]) * ((uint64_t)b.u32[0]); -#endif + simde__m64_private r_, a_ = simde__m64_to_private(a), + b_ = simde__m64_to_private(b); - return r; + r_.u64[0] = HEDLEY_STATIC_CAST(uint64_t, a_.u32[0]) * + HEDLEY_STATIC_CAST(uint64_t, b_.u32[0]); + + return simde__m64_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_mul_su32(a, b) simde_mm_mul_su32(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_mulhi_epi16(simde__m128i a, simde__m128i b) { - simde__m128i r; +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_mulhi_epi16(a, b); +#else + simde__m128i_private r_, a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); -#if defined(SIMDE_SSE2_NATIVE) - r.n = _mm_mulhi_epi16(a.n, b.n); -#elif 
defined(SIMDE_SSE2_NEON) - int16x4_t a3210 = vget_low_s16(a.neon_i16); - int16x4_t b3210 = vget_low_s16(b.neon_i16); +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + int16x4_t a3210 = vget_low_s16(a_.neon_i16); + int16x4_t b3210 = vget_low_s16(b_.neon_i16); int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */ - int16x4_t a7654 = vget_high_s16(a.neon_i16); - int16x4_t b7654 = vget_high_s16(b.neon_i16); + int16x4_t a7654 = vget_high_s16(a_.neon_i16); + int16x4_t b7654 = vget_high_s16(b_.neon_i16); int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */ uint16x8x2_t rv = vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654)); - r.neon_u16 = rv.val[1]; -#else - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) { - r.u16[i] = (uint16_t)(((uint32_t)(((int32_t)a.i16[i]) * - ((int32_t)b.i16[i]))) >> - 16); + r_.neon_u16 = rv.val[1]; +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) { + r_.u16[i] = HEDLEY_STATIC_CAST( + uint16_t, + (HEDLEY_STATIC_CAST( + uint32_t, + HEDLEY_STATIC_CAST(int32_t, a_.i16[i]) * + HEDLEY_STATIC_CAST(int32_t, + b_.i16[i])) >> + 16)); } #endif - return r; + return simde__m128i_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_mulhi_epi16(a, b) simde_mm_mulhi_epi16(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_mulhi_epu16(simde__m128i a, simde__m128i b) { - simde__m128i r; - -#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI) - r.n = _mm_mulhi_epu16(a.n, b.n); +#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI) + return _mm_mulhi_epu16(a, b); #else - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.u16) / sizeof(r.u16[0])); i++) { - r.u16[i] = (uint16_t)( - (((uint32_t)a.u16[i]) * ((uint32_t)b.u16[i])) >> 16); + simde__m128i_private r_, a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.u16) / sizeof(r_.u16[0])); i++) { + r_.u16[i] = HEDLEY_STATIC_CAST( + uint16_t, + HEDLEY_STATIC_CAST(uint32_t, a_.u16[i]) * + HEDLEY_STATIC_CAST(uint32_t, + b_.u16[i]) >> + 16); } -#endif - return r; + return simde__m128i_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_mulhi_epu16(a, b) simde_mm_mulhi_epu16(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_mullo_epi16(simde__m128i a, simde__m128i b) { - simde__m128i r; +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_mullo_epi16(a, b); +#else + simde__m128i_private r_, a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); -#if defined(SIMDE_SSE2_NATIVE) - r.n = _mm_mullo_epi16(a.n, b.n); -#elif defined(SIMDE_SSE2_NEON) - r.neon_i16 = vmulq_s16(a.neon_i16, b.neon_i16); +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i16 = vmulq_s16(a_.neon_i16, b_.neon_i16); #else - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) { - r.u16[i] = (uint16_t)(((uint32_t)(((int32_t)a.i16[i]) * - ((int32_t)b.i16[i]))) & - 0xffff); + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) { + r_.u16[i] = HEDLEY_STATIC_CAST( + uint16_t, + HEDLEY_STATIC_CAST(uint32_t, a_.u16[i]) * + HEDLEY_STATIC_CAST(uint32_t, b_.u16[i])); } #endif - return r; + return simde__m128i_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_mullo_epi16(a, b) simde_mm_mullo_epi16(a, b) +#endif 
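For reference, the portable fallbacks above for simde_mm_mulhi_epi16, simde_mm_mulhi_epu16, and simde_mm_mullo_epi16 all widen each 16-bit lane to a 32-bit product and then keep one half of it: mulhi keeps the upper 16 bits, mullo the lower 16 bits. A minimal per-lane sketch of that arithmetic, assuming plain int16_t inputs (the helper names mulhi16/mullo16 are illustrative only and are not part of SIMDE or of this patch):

#include <stdint.h>

/* high 16 bits of the widened signed product, as in simde_mm_mulhi_epi16 */
static inline uint16_t mulhi16(int16_t a, int16_t b)
{
	return (uint16_t)((uint32_t)((int32_t)a * (int32_t)b) >> 16);
}

/* low 16 bits of the widened product, as in simde_mm_mullo_epi16 */
static inline uint16_t mullo16(int16_t a, int16_t b)
{
	return (uint16_t)((uint32_t)((int32_t)a * (int32_t)b) & 0xffff);
}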
-SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_or_pd(simde__m128d a, simde__m128d b) { - simde__m128d r; +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_or_pd(a, b); +#else + simde__m128d_private r_, a_ = simde__m128d_to_private(a), + b_ = simde__m128d_to_private(b); -#if defined(SIMDE_SSE2_NATIVE) - r.n = _mm_or_pd(a.n, b.n); +#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i32f = a_.i32f | b_.i32f; #else - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) { - r.i64[i] = a.i64[i] | b.i64[i]; + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])); i++) { + r_.i32f[i] = a_.i32f[i] | b_.i32f[i]; } #endif - return r; + return simde__m128d_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_or_pd(a, b) simde_mm_or_pd(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_or_si128(simde__m128i a, simde__m128i b) { - simde__m128i r; +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_or_si128(a, b); +#else + simde__m128i_private r_, a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); -#if defined(SIMDE_SSE2_NATIVE) - r.n = _mm_or_si128(a.n, b.n); -#elif defined(SIMDE_SSE2_NEON) - r.neon_i32 = vorrq_s32(a.neon_i32, b.neon_i32); +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i32 = vorrq_s32(a_.neon_i32, b_.neon_i32); +#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) + r_.altivec_i32 = vec_or(a_.altivec_i32, b_.altivec_i32); +#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i32f = a_.i32f | b_.i32f; #else - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) { - r.i64[i] = a.i64[i] | b.i64[i]; + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])); i++) { + r_.i32f[i] = a_.i32f[i] | b_.i32f[i]; } #endif - return r; + return simde__m128i_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_or_si128(a, b) simde_mm_or_si128(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_packs_epi16(simde__m128i a, simde__m128i b) { - simde__m128i r; - -#if defined(SIMDE_SSE2_NATIVE) - r.n = _mm_packs_epi16(a.n, b.n); -#elif defined(SIMDE_SSE2_NEON) - r.neon_i8 = vcombine_s8(vqmovn_s16(a.neon_i16), vqmovn_s16(b.neon_i16)); -#else - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) { - r.i8[i] = (a.i16[i] > INT8_MAX) - ? INT8_MAX - : ((a.i16[i] < INT8_MIN) - ? INT8_MIN - : ((int8_t)a.i16[i])); - r.i8[i + 8] = (b.i16[i] > INT8_MAX) - ? INT8_MAX - : ((b.i16[i] < INT8_MIN) - ? INT8_MIN - : ((int8_t)b.i16[i])); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_packs_epi16(a, b); +#else + simde__m128i_private r_, a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i8 = + vcombine_s8(vqmovn_s16(a_.neon_i16), vqmovn_s16(b_.neon_i16)); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) { + r_.i8[i] = (a_.i16[i] > INT8_MAX) + ? INT8_MAX + : ((a_.i16[i] < INT8_MIN) + ? INT8_MIN + : HEDLEY_STATIC_CAST(int8_t, + a_.i16[i])); + r_.i8[i + 8] = (b_.i16[i] > INT8_MAX) + ? INT8_MAX + : ((b_.i16[i] < INT8_MIN) + ? 
INT8_MIN + : HEDLEY_STATIC_CAST( + int8_t, b_.i16[i])); } #endif - return r; + return simde__m128i_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_packs_epi16(a, b) simde_mm_packs_epi16(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_packs_epi32(simde__m128i a, simde__m128i b) { - simde__m128i r; - -#if defined(SIMDE_SSE2_NATIVE) - r.n = _mm_packs_epi32(a.n, b.n); -#elif defined(SIMDE_SSE2_NEON) - r.neon_i16 = - vcombine_s16(vqmovn_s32(a.neon_i32), vqmovn_s32(b.neon_i32)); -#else - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) { - r.i16[i] = (a.i32[i] > INT16_MAX) - ? INT16_MAX - : ((a.i32[i] < INT16_MIN) - ? INT16_MIN - : ((int16_t)a.i32[i])); - r.i16[i + 4] = (b.i32[i] > INT16_MAX) - ? INT16_MAX - : ((b.i32[i] < INT16_MIN) - ? INT16_MIN - : ((int16_t)b.i32[i])); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_packs_epi32(a, b); +#else + simde__m128i_private r_, a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i16 = + vcombine_s16(vqmovn_s32(a_.neon_i32), vqmovn_s32(b_.neon_i32)); +#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) + r_.altivec_i16 = vec_packs(a_.altivec_i32, b_.altivec_i32); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) { + r_.i16[i] = (a_.i32[i] > INT16_MAX) + ? INT16_MAX + : ((a_.i32[i] < INT16_MIN) + ? INT16_MIN + : HEDLEY_STATIC_CAST(int16_t, + a_.i32[i])); + r_.i16[i + 4] = + (b_.i32[i] > INT16_MAX) + ? INT16_MAX + : ((b_.i32[i] < INT16_MIN) + ? INT16_MIN + : HEDLEY_STATIC_CAST(int16_t, + b_.i32[i])); } #endif - return r; + return simde__m128i_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_packs_epi32(a, b) simde_mm_packs_epi32(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_packus_epi16(simde__m128i a, simde__m128i b) { - simde__m128i r; - -#if defined(SIMDE_SSE2_NATIVE) - r.n = _mm_packus_epi16(a.n, b.n); -#elif defined(SIMDE_SSE2_NEON) - r.neon_u8 = - vcombine_u8(vqmovun_s16(a.neon_i16), vqmovun_s16(b.neon_i16)); -#else - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) { - r.u8[i] = (a.i16[i] > UINT8_MAX) - ? UINT8_MAX - : ((a.i16[i] < 0) ? 0 : ((int8_t)a.i16[i])); - r.u8[i + 8] = - (b.i16[i] > UINT8_MAX) +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_packus_epi16(a, b); +#else + simde__m128i_private r_, a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_u8 = + vcombine_u8(vqmovun_s16(a_.neon_i16), vqmovun_s16(b_.neon_i16)); +#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) + r_.altivec_u8 = vec_packsu(a_.altivec_i16, b_.altivec_i16); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) { + r_.u8[i] = (a_.i16[i] > UINT8_MAX) + ? UINT8_MAX + : ((a_.i16[i] < 0) + ? UINT8_C(0) + : HEDLEY_STATIC_CAST(uint8_t, + a_.i16[i])); + r_.u8[i + 8] = + (b_.i16[i] > UINT8_MAX) ? UINT8_MAX - : ((b.i16[i] < 0) ? 0 : ((int8_t)b.i16[i])); + : ((b_.i16[i] < 0) + ? 
UINT8_C(0) + : HEDLEY_STATIC_CAST(uint8_t, + b_.i16[i])); } #endif - return r; + return simde__m128i_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_packus_epi16(a, b) simde_mm_packus_epi16(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES void simde_mm_pause(void) { -#if defined(SIMDE_SSE2_NATIVE) +#if defined(SIMDE_X86_SSE2_NATIVE) _mm_pause(); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_pause() (simde_mm_pause()) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_sad_epu8(simde__m128i a, simde__m128i b) { - simde__m128i r; - -#if defined(SIMDE_SSE2_NATIVE) - r.n = _mm_sad_epu8(a.n, b.n); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_sad_epu8(a, b); #else - for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) { + simde__m128i_private r_, a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); + + for (size_t i = 0; i < (sizeof(r_.i64) / sizeof(r_.i64[0])); i++) { uint16_t tmp = 0; - SIMDE__VECTORIZE_REDUCTION(+ : tmp) - for (size_t j = 0; j < ((sizeof(r.u8) / sizeof(r.u8[0])) / 2); + SIMDE_VECTORIZE_REDUCTION(+ : tmp) + for (size_t j = 0; j < ((sizeof(r_.u8) / sizeof(r_.u8[0])) / 2); j++) { const size_t e = j + (i * 8); - tmp += (a.u8[e] > b.u8[e]) ? (a.u8[e] - b.u8[e]) - : (b.u8[e] - a.u8[e]); + tmp += (a_.u8[e] > b_.u8[e]) ? (a_.u8[e] - b_.u8[e]) + : (b_.u8[e] - a_.u8[e]); } - r.i64[i] = tmp; + r_.i64[i] = tmp; } -#endif - return r; + return simde__m128i_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_sad_epu8(a, b) simde_mm_sad_epu8(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_set_epi8(int8_t e15, int8_t e14, int8_t e13, int8_t e12, int8_t e11, int8_t e10, int8_t e9, int8_t e8, int8_t e7, int8_t e6, int8_t e5, int8_t e4, int8_t e3, int8_t e2, int8_t e1, int8_t e0) { - simde__m128i r; -#if defined(SIMDE_SSE2_NATIVE) - r.n = _mm_set_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, - e3, e2, e1, e0); -#else - r.i8[0] = e0; - r.i8[1] = e1; - r.i8[2] = e2; - r.i8[3] = e3; - r.i8[4] = e4; - r.i8[5] = e5; - r.i8[6] = e6; - r.i8[7] = e7; - r.i8[8] = e8; - r.i8[9] = e9; - r.i8[10] = e10; - r.i8[11] = e11; - r.i8[12] = e12; - r.i8[13] = e13; - r.i8[14] = e14; - r.i8[15] = e15; +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_set_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, + e4, e3, e2, e1, e0); +#else + simde__m128i_private r_; + +#if defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i8x16_make(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, + e10, e11, e12, e13, e14, e15); +#else + r_.i8[0] = e0; + r_.i8[1] = e1; + r_.i8[2] = e2; + r_.i8[3] = e3; + r_.i8[4] = e4; + r_.i8[5] = e5; + r_.i8[6] = e6; + r_.i8[7] = e7; + r_.i8[8] = e8; + r_.i8[9] = e9; + r_.i8[10] = e10; + r_.i8[11] = e11; + r_.i8[12] = e12; + r_.i8[13] = e13; + r_.i8[14] = e14; + r_.i8[15] = e15; #endif - return r; + return simde__m128i_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_set_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, \ + e2, e1, e0) \ + simde_mm_set_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, \ + e4, e3, e2, e1, e0) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_set_epi16(int16_t e7, int16_t e6, int16_t e5, int16_t e4, int16_t e3, int16_t e2, int16_t e1, int16_t e0) { - simde__m128i r; +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_set_epi16(e7, e6, 
e5, e4, e3, e2, e1, e0); +#else + simde__m128i_private r_; -#if defined(SIMDE_SSE2_NATIVE) - r.n = _mm_set_epi16(e7, e6, e5, e4, e3, e2, e1, e0); -#elif defined(SIMDE_SSE2_NEON) - SIMDE_ALIGN(16) int16_t data[8] = {e0, e1, e2, e3, e4, e5, e6, e7}; - r.neon_i16 = vld1q_s16(data); +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + SIMDE_ALIGN_AS(16, int16x8_t) + int16_t data[8] = {e0, e1, e2, e3, e4, e5, e6, e7}; + r_.neon_i16 = vld1q_s16(data); +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i16x8_make(e0, e1, e2, e3, e4, e5, e6, e7); #else - r.i16[0] = e0; - r.i16[1] = e1; - r.i16[2] = e2; - r.i16[3] = e3; - r.i16[4] = e4; - r.i16[5] = e5; - r.i16[6] = e6; - r.i16[7] = e7; + r_.i16[0] = e0; + r_.i16[1] = e1; + r_.i16[2] = e2; + r_.i16[3] = e3; + r_.i16[4] = e4; + r_.i16[5] = e5; + r_.i16[6] = e6; + r_.i16[7] = e7; #endif - return r; + return simde__m128i_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_set_epi16(e7, e6, e5, e4, e3, e2, e1, e0) \ + simde_mm_set_epi16(e7, e6, e5, e4, e3, e2, e1, e0) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_set_epi32(int32_t e3, int32_t e2, int32_t e1, int32_t e0) { - simde__m128i r; +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_set_epi32(e3, e2, e1, e0); +#else + simde__m128i_private r_; -#if defined(SIMDE_SSE2_NATIVE) - r.n = _mm_set_epi32(e3, e2, e1, e0); -#elif defined(SIMDE_SSE2_NEON) - SIMDE_ALIGN(16) int32_t data[4] = {e0, e1, e2, e3}; - r.neon_i32 = vld1q_s32(data); +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + SIMDE_ALIGN_AS(16, int32x4_t) int32_t data[4] = {e0, e1, e2, e3}; + r_.neon_i32 = vld1q_s32(data); +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i32x4_make(e0, e1, e2, e3); #else - r.i32[0] = e0; - r.i32[1] = e1; - r.i32[2] = e2; - r.i32[3] = e3; + r_.i32[0] = e0; + r_.i32[1] = e1; + r_.i32[2] = e2; + r_.i32[3] = e3; #endif - return r; + return simde__m128i_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_set_epi32(e3, e2, e1, e0) simde_mm_set_epi32(e3, e2, e1, e0) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_set_epi64(simde__m64 e1, simde__m64 e0) { - simde__m128i r; - -#if defined(SIMDE_SSE2_NATIVE) - r.n = _mm_set_epi64(e1.n, e0.n); +#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) + return _mm_set_epi64(e1, e0); #else - r.i64[0] = e0.i64[0]; - r.i64[1] = e1.i64[0]; -#endif + simde__m128i_private r_; - return r; + r_.m64_private[0] = simde__m64_to_private(e0); + r_.m64_private[1] = simde__m64_to_private(e1); + + return simde__m128i_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_set_epi64(e1, e0) (simde_mm_set_epi64((e1), (e0))) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_set_epi64x(int64_t e1, int64_t e0) { - simde__m128i r; +#if defined(SIMDE_X86_SSE2_NATIVE) && \ + (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19, 0, 0)) + return _mm_set_epi64x(e1, e0); +#else + simde__m128i_private r_; -#if defined(SIMDE_SSE2_NATIVE) - r.n = _mm_set_epi64x(e1, e0); -#elif defined(SIMDE_SSE2_NEON) - r = SIMDE__M128I_NEON_C(i64, - vcombine_s64(vdup_n_s64(e0), vdup_n_s64(e1))); +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i64 = vcombine_s64(vdup_n_s64(e0), vdup_n_s64(e1)); +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i64x2_make(e0, e1); #else - r.i64[0] = e0; - r.i64[1] = e1; + r_.i64[0] = e0; + r_.i64[1] = e1; #endif - 
return r; + return simde__m128i_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_set_epi64x(e1, e0) simde_mm_set_epi64x(e1, e0) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_x_mm_set_epu8(uint8_t e15, uint8_t e14, uint8_t e13, uint8_t e12, uint8_t e11, uint8_t e10, uint8_t e9, uint8_t e8, uint8_t e7, uint8_t e6, uint8_t e5, uint8_t e4, uint8_t e3, uint8_t e2, uint8_t e1, uint8_t e0) { - simde__m128i r; - - r.u8[0] = e0; - r.u8[1] = e1; - r.u8[2] = e2; - r.u8[3] = e3; - r.u8[4] = e4; - r.u8[5] = e5; - r.u8[6] = e6; - r.u8[7] = e7; - r.u8[8] = e8; - r.u8[9] = e9; - r.u8[10] = e10; - r.u8[11] = e11; - r.u8[12] = e12; - r.u8[13] = e13; - r.u8[14] = e14; - r.u8[15] = e15; - - return r; -} - -SIMDE__FUNCTION_ATTRIBUTES +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_set_epi8( + HEDLEY_STATIC_CAST(char, e15), HEDLEY_STATIC_CAST(char, e14), + HEDLEY_STATIC_CAST(char, e13), HEDLEY_STATIC_CAST(char, e12), + HEDLEY_STATIC_CAST(char, e11), HEDLEY_STATIC_CAST(char, e10), + HEDLEY_STATIC_CAST(char, e9), HEDLEY_STATIC_CAST(char, e8), + HEDLEY_STATIC_CAST(char, e7), HEDLEY_STATIC_CAST(char, e6), + HEDLEY_STATIC_CAST(char, e5), HEDLEY_STATIC_CAST(char, e4), + HEDLEY_STATIC_CAST(char, e3), HEDLEY_STATIC_CAST(char, e2), + HEDLEY_STATIC_CAST(char, e1), HEDLEY_STATIC_CAST(char, e0)); +#else + simde__m128i_private r_; + + r_.u8[0] = e0; + r_.u8[1] = e1; + r_.u8[2] = e2; + r_.u8[3] = e3; + r_.u8[4] = e4; + r_.u8[5] = e5; + r_.u8[6] = e6; + r_.u8[7] = e7; + r_.u8[8] = e8; + r_.u8[9] = e9; + r_.u8[10] = e10; + r_.u8[11] = e11; + r_.u8[12] = e12; + r_.u8[13] = e13; + r_.u8[14] = e14; + r_.u8[15] = e15; + + return simde__m128i_from_private(r_); +#endif +} + +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_x_mm_set_epu16(uint16_t e7, uint16_t e6, uint16_t e5, uint16_t e4, uint16_t e3, uint16_t e2, uint16_t e1, uint16_t e0) { - simde__m128i r; +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_set_epi16( + HEDLEY_STATIC_CAST(short, e7), HEDLEY_STATIC_CAST(short, e6), + HEDLEY_STATIC_CAST(short, e5), HEDLEY_STATIC_CAST(short, e4), + HEDLEY_STATIC_CAST(short, e3), HEDLEY_STATIC_CAST(short, e2), + HEDLEY_STATIC_CAST(short, e1), HEDLEY_STATIC_CAST(short, e0)); +#else + simde__m128i_private r_; - r.u16[0] = e0; - r.u16[1] = e1; - r.u16[2] = e2; - r.u16[3] = e3; - r.u16[4] = e4; - r.u16[5] = e5; - r.u16[6] = e6; - r.u16[7] = e7; + r_.u16[0] = e0; + r_.u16[1] = e1; + r_.u16[2] = e2; + r_.u16[3] = e3; + r_.u16[4] = e4; + r_.u16[5] = e5; + r_.u16[6] = e6; + r_.u16[7] = e7; - return r; + return simde__m128i_from_private(r_); +#endif } -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_x_mm_set_epu32(uint32_t e3, uint32_t e2, uint32_t e1, uint32_t e0) { - simde__m128i r; +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_set_epi32(HEDLEY_STATIC_CAST(int, e3), + HEDLEY_STATIC_CAST(int, e2), + HEDLEY_STATIC_CAST(int, e1), + HEDLEY_STATIC_CAST(int, e0)); +#else + simde__m128i_private r_; - r.u32[0] = e0; - r.u32[1] = e1; - r.u32[2] = e2; - r.u32[3] = e3; + r_.u32[0] = e0; + r_.u32[1] = e1; + r_.u32[2] = e2; + r_.u32[3] = e3; - return r; + return simde__m128i_from_private(r_); +#endif } -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_x_mm_set_epu64x(uint64_t e1, uint64_t e0) { - simde__m128i r; +#if defined(SIMDE_X86_SSE2_NATIVE) && \ + (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19, 0, 0)) + return _mm_set_epi64x(HEDLEY_STATIC_CAST(int64_t, e1), + HEDLEY_STATIC_CAST(int64_t, e0)); +#else + 
simde__m128i_private r_; - r.u64[0] = e0; - r.u64[1] = e1; + r_.u64[0] = e0; + r_.u64[1] = e1; - return r; + return simde__m128i_from_private(r_); +#endif } -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_set_pd(simde_float64 e1, simde_float64 e0) { - simde__m128d r; +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_set_pd(e1, e0); +#else + simde__m128d_private r_; -#if defined(SIMDE_SSE2_NATIVE) - r.n = _mm_set_pd(e1, e0); +#if defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f64x2_make(e0, e1); +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f64x2_make(e0, e1); #else - r.f64[0] = e0; - r.f64[1] = e1; + r_.f64[0] = e0; + r_.f64[1] = e1; #endif - return r; + return simde__m128d_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_set_pd(e1, e0) simde_mm_set_pd(e1, e0) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_set_pd1(simde_float64 a) { - simde__m128d r; - -#if defined(SIMDE_SSE2_NATIVE) - r.n = _mm_set1_pd(a); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_set1_pd(a); #else - r.f64[0] = a; - r.f64[1] = a; -#endif + simde__m128d_private r_; - return r; + r_.f64[0] = a; + r_.f64[1] = a; + + return simde__m128d_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_set_pd1(a) simde_mm_set1_pd(a) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_set_sd(simde_float64 a) { - simde__m128d r; - -#if defined(SIMDE_SSE2_NATIVE) - r.n = _mm_set_sd(a); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_set_sd(a); +#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vsetq_lane_f64(a, vdupq_n_f64(SIMDE_FLOAT32_C(0.0)), 0); #else - r.f64[0] = a; - r.u64[1] = 0; -#endif + return simde_mm_set_pd(SIMDE_FLOAT64_C(0.0), a); - return r; +#endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_set_sd(a) simde_mm_set_sd(a) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_set1_epi8(int8_t a) { - simde__m128i r; +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_set1_epi8(a); +#else + simde__m128i_private r_; -#if defined(SIMDE_SSE2_NATIVE) - r.n = _mm_set1_epi8(a); -#elif defined(SIMDE_SSE2_NEON) - r.neon_i8 = vdupq_n_s8(a); +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i8 = vdupq_n_s8(a); +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i8x16_splat(a); #else - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.i8) / sizeof(r.i8[0])); i++) { - r.i8[i] = a; + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])); i++) { + r_.i8[i] = a; } #endif - return r; + return simde__m128i_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_set1_epi8(a) simde_mm_set1_epi8(a) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_set1_epi16(int16_t a) { - simde__m128i r; +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_set1_epi16(a); +#else + simde__m128i_private r_; -#if defined(SIMDE_SSE2_NATIVE) - r.n = _mm_set1_epi16(a); -#elif defined(SIMDE_SSE2_NEON) - r.neon_i16 = vdupq_n_s16(a); +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i16 = vdupq_n_s16(a); +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i16x8_splat(a); #else - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) { - r.i16[i] = a; + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) { + r_.i16[i] = a; } #endif - 
return r; + return simde__m128i_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_set1_epi16(a) simde_mm_set1_epi16(a) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_set1_epi32(int32_t a) { - simde__m128i r; +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_set1_epi32(a); +#else + simde__m128i_private r_; -#if defined(SIMDE_SSE2_NATIVE) - r.n = _mm_set1_epi32(a); -#elif defined(SIMDE_SSE2_NEON) - r.neon_i32 = vdupq_n_s32(a); +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i32 = vdupq_n_s32(a); +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i32x4_splat(a); #else - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) { - r.i32[i] = a; + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) { + r_.i32[i] = a; } #endif - return r; + return simde__m128i_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_set1_epi32(a) simde_mm_set1_epi32(a) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_set1_epi64x(int64_t a) { - simde__m128i r; +#if defined(SIMDE_X86_SSE2_NATIVE) && \ + (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19, 0, 0)) + return _mm_set1_epi64x(a); +#else + simde__m128i_private r_; -#if defined(SIMDE_SSE2_NATIVE) - r.n = _mm_set1_epi64x(a); -#elif defined(SIMDE_SSE2_NEON) - r.neon_i64 = vmovq_n_s64(a); +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i64 = vmovq_n_s64(a); +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i64x2_splat(a); #else - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) { - r.i64[i] = a; + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i64) / sizeof(r_.i64[0])); i++) { + r_.i64[i] = a; } #endif - return r; + return simde__m128i_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_set1_epi64x(a) simde_mm_set1_epi64x(a) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_set1_epi64(simde__m64 a) { - simde__m128i r; - -#if defined(SIMDE_SSE2_NATIVE) - r.n = _mm_set1_epi64(a.n); +#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) + return _mm_set1_epi64(a); #else - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) { - r.i64[i] = a.i64[0]; - } + simde__m64_private a_ = simde__m64_to_private(a); + return simde_mm_set1_epi64x(a_.i64[0]); +#endif +} +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_set1_epi64(a) simde_mm_set1_epi64(a) #endif - return r; +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i simde_x_mm_set1_epu8(uint8_t value) +{ + return simde_mm_set1_epi8(HEDLEY_STATIC_CAST(int8_t, value)); +} + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i simde_x_mm_set1_epu16(uint16_t value) +{ + return simde_mm_set1_epi16(HEDLEY_STATIC_CAST(int16_t, value)); +} + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i simde_x_mm_set1_epu32(uint32_t value) +{ + return simde_mm_set1_epi32(HEDLEY_STATIC_CAST(int32_t, value)); } -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i simde_x_mm_set1_epu64(uint64_t value) +{ + return simde_mm_set1_epi64x(HEDLEY_STATIC_CAST(int64_t, value)); +} + +SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_set1_pd(simde_float64 a) { - simde__m128d r; +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_set1_pd(a); +#else + simde__m128d_private r_; -#if defined(SIMDE_SSE2_NATIVE) - r.n = _mm_set1_pd(a); +#if 
defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f64x2_splat(a); #else - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) { - r.f64[i] = a; + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i64) / sizeof(r_.i64[0])); i++) { + r_.f64[i] = a; } #endif - return r; + return simde__m128d_from_private(r_); +#endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_set1_pd(a) simde_mm_set1_pd(a) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_setr_epi8(int8_t e15, int8_t e14, int8_t e13, int8_t e12, int8_t e11, int8_t e10, int8_t e9, int8_t e8, int8_t e7, int8_t e6, int8_t e5, int8_t e4, int8_t e3, int8_t e2, int8_t e1, int8_t e0) { - simde__m128i r; - -#if defined(SIMDE_SSE2_NATIVE) - r.n = _mm_setr_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, - e4, e3, e2, e1, e0); -#elif defined(SIMDE_SSE2_NEON) - int8_t t[] = {e15, e14, e13, e12, e11, e10, e9, e8, - e7, e6, e5, e4, e3, e2, e1, e0}; - r.neon_i8 = vld1q_s8(t); -#else - r.i8[0] = e15; - r.i8[1] = e14; - r.i8[2] = e13; - r.i8[3] = e12; - r.i8[4] = e11; - r.i8[5] = e10; - r.i8[6] = e9; - r.i8[7] = e8; - r.i8[8] = e7; - r.i8[9] = e6; - r.i8[10] = e5; - r.i8[11] = e4; - r.i8[12] = e3; - r.i8[13] = e2; - r.i8[14] = e1; - r.i8[15] = e0; +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_setr_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, + e4, e3, e2, e1, e0); +#else + return simde_mm_set_epi8(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, + e11, e12, e13, e14, e15); #endif - - return r; } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_setr_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, \ + e3, e2, e1, e0) \ + simde_mm_setr_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, \ + e4, e3, e2, e1, e0) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_setr_epi16(int16_t e7, int16_t e6, int16_t e5, int16_t e4, int16_t e3, int16_t e2, int16_t e1, int16_t e0) { - simde__m128i r; - -#if defined(SIMDE_SSE2_NATIVE) - r.n = _mm_setr_epi16(e7, e6, e5, e4, e3, e2, e1, e0); -#elif defined(SIMDE_SSE2_NEON) - int16_t t[] = {e7, e6, e5, e4, e3, e2, e1, e0}; - r.neon_i16 = vld1q_s16(t); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_setr_epi16(e7, e6, e5, e4, e3, e2, e1, e0); #else - r.i16[0] = e7; - r.i16[1] = e6; - r.i16[2] = e5; - r.i16[3] = e4; - r.i16[4] = e3; - r.i16[5] = e2; - r.i16[6] = e1; - r.i16[7] = e0; + return simde_mm_set_epi16(e0, e1, e2, e3, e4, e5, e6, e7); #endif - - return r; } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_setr_epi16(e7, e6, e5, e4, e3, e2, e1, e0) \ + simde_mm_setr_epi16(e7, e6, e5, e4, e3, e2, e1, e0) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_setr_epi32(int32_t e3, int32_t e2, int32_t e1, int32_t e0) { - simde__m128i r; - -#if defined(SIMDE_SSE2_NATIVE) - r.n = _mm_setr_epi32(e3, e2, e1, e0); -#elif defined(SIMDE_SSE2_NEON) - int32_t t[] = {e3, e2, e1, e0}; - r.neon_i32 = vld1q_s32(t); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_setr_epi32(e3, e2, e1, e0); #else - r.i32[0] = e3; - r.i32[1] = e2; - r.i32[2] = e1; - r.i32[3] = e0; + return simde_mm_set_epi32(e0, e1, e2, e3); #endif - - return r; } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_setr_epi32(e3, e2, e1, e0) simde_mm_setr_epi32(e3, e2, e1, e0) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_setr_epi64(simde__m64 e1, simde__m64 e0) { - simde__m128i r; - -#if 
defined(SIMDE_SSE2_NATIVE) - r.n = _mm_setr_epi64(e1.n, e0.n); -#elif defined(SIMDE_SSE2_NEON) - r.neon_i64 = vcombine_s64(e1.neon_i64, e0.neon_i64); +#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) + return _mm_setr_epi64(e1, e0); #else - r.i64[0] = e1.i64[0]; - r.i64[1] = e0.i64[0]; + return simde_mm_set_epi64(e0, e1); #endif - - return r; } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_setr_epi64(e1, e0) (simde_mm_setr_epi64((e1), (e0))) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_setr_pd(simde_float64 e1, simde_float64 e0) { - simde__m128d r; - -#if defined(SIMDE_SSE2_NATIVE) - r.n = _mm_setr_pd(e1, e0); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_setr_pd(e1, e0); #else - r.f64[0] = e1; - r.f64[1] = e0; + return simde_mm_set_pd(e0, e1); #endif - - return r; } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_setr_pd(e1, e0) simde_mm_setr_pd(e1, e0) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_setzero_pd(void) { - simde__m128d r; - -#if defined(SIMDE_SSE2_NATIVE) - r.n = _mm_setzero_pd(); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_setzero_pd(); #else - r.u64[0] = 0; - r.u64[1] = 0; + simde__m128d_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])); i++) { + r_.i32f[i] = 0; + } + + return simde__m128d_from_private(r_); +#endif +} +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_setzero_pd() simde_mm_setzero_pd() #endif - return r; +#if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128d simde_mm_undefined_pd(void) +{ + simde__m128d_private r_; + +#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE__HAVE_UNDEFINED128) + r_.n = _mm_undefined_pd(); +#elif !defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) + r_ = simde__m128d_to_private(simde_mm_setzero_pd()); +#endif + + return simde__m128d_from_private(r_); } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_undefined_pd() simde_mm_undefined_pd() +#endif -SIMDE__FUNCTION_ATTRIBUTES -simde__m128i simde_mm_setzero_si128(void) +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i simde_mm_undefined_si128(void) { - simde__m128i r; + simde__m128i_private r_; -#if defined(SIMDE_SSE2_NATIVE) - r.n = _mm_setzero_si128(); -#elif defined(SIMDE_SSE2_NEON) - r.neon_i32 = vdupq_n_s32(0); -#else - r.u64[0] = 0; - r.u64[1] = 0; +#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE__HAVE_UNDEFINED128) + r_.n = _mm_undefined_si128(); +#elif !defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) + r_ = simde__m128i_to_private(simde_mm_setzero_si128()); #endif - return r; + return simde__m128i_from_private(r_); +} +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_undefined_si128() (simde_mm_undefined_si128()) +#endif + +#if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) +HEDLEY_DIAGNOSTIC_POP +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128d simde_x_mm_setone_pd(void) +{ + return simde_mm_castps_pd(simde_x_mm_setone_ps()); +} + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i simde_x_mm_setone_si128(void) +{ + return simde_mm_castps_si128(simde_x_mm_setone_ps()); } -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_shuffle_epi32(simde__m128i a, const int imm8) + SIMDE_REQUIRE_RANGE(imm8, 0, 255) { - simde__m128i r; + simde__m128i_private r_, a_ = simde__m128i_to_private(a); - for (size_t i = 0; i < (sizeof(r.i32) / 
sizeof(r.i32[0])); i++) { - r.i32[i] = a.i32[(imm8 >> (i * 2)) & 3]; + for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) { + r_.i32[i] = a_.i32[(imm8 >> (i * 2)) & 3]; } - return r; + return simde__m128i_from_private(r_); } -#if defined(SIMDE_SSE2_NATIVE) -#define simde_mm_shuffle_epi32(a, imm8) \ - SIMDE__M128I_C(_mm_shuffle_epi32((a).n, (imm8))) -#elif defined(SIMDE__SHUFFLE_VECTOR) -#define simde_mm_shuffle_epi32(a, imm8) \ - ({ \ - const simde__m128i simde__tmp_a_ = a; \ - (simde__m128i){.i32 = SIMDE__SHUFFLE_VECTOR( \ - 32, 16, (simde__tmp_a_).i32, \ - (simde__tmp_a_).i32, ((imm8)) & 3, \ - ((imm8) >> 2) & 3, ((imm8) >> 4) & 3, \ - ((imm8) >> 6) & 3)}; \ - }) -#endif - -SIMDE__FUNCTION_ATTRIBUTES +#if defined(SIMDE_X86_SSE2_NATIVE) +#define simde_mm_shuffle_epi32(a, imm8) _mm_shuffle_epi32((a), (imm8)) +#elif defined(SIMDE_SHUFFLE_VECTOR_) +#define simde_mm_shuffle_epi32(a, imm8) \ + (__extension__({ \ + const simde__m128i_private simde__tmp_a_ = \ + simde__m128i_to_private(a); \ + simde__m128i_from_private((simde__m128i_private){ \ + .i32 = SIMDE_SHUFFLE_VECTOR_( \ + 32, 16, (simde__tmp_a_).i32, \ + (simde__tmp_a_).i32, ((imm8)) & 3, \ + ((imm8) >> 2) & 3, ((imm8) >> 4) & 3, \ + ((imm8) >> 6) & 3)}); \ + })) +#endif +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_shuffle_epi32(a, imm8) simde_mm_shuffle_epi32(a, imm8) +#endif + +SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_shuffle_pd(simde__m128d a, simde__m128d b, const int imm8) + SIMDE_REQUIRE_RANGE(imm8, 0, 3) { - simde__m128d r; + simde__m128d_private r_, a_ = simde__m128d_to_private(a), + b_ = simde__m128d_to_private(b); - r.f64[0] = ((imm8 & 1) == 0) ? a.f64[0] : a.f64[1]; - r.f64[1] = ((imm8 & 2) == 0) ? b.f64[0] : b.f64[1]; + r_.f64[0] = ((imm8 & 1) == 0) ? a_.f64[0] : a_.f64[1]; + r_.f64[1] = ((imm8 & 2) == 0) ? 
b_.f64[0] : b_.f64[1]; - return r; + return simde__m128d_from_private(r_); } -#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI) -#define simde_mm_shuffle_pd(a, b, imm8) \ - SIMDE__M128D_C(_mm_shuffle_pd((a).n, (b).n, (imm8))) -#elif defined(SIMDE__SHUFFLE_VECTOR) -#define simde_mm_shuffle_pd(a, b, imm8) \ - ({ \ - (simde__m128d){.f64 = SIMDE__SHUFFLE_VECTOR( \ - 64, 16, (a).f64, (b).f64, \ - (((imm8)) & 1), \ - (((imm8) >> 1) & 1) + 2)}; \ - }) +#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI) +#define simde_mm_shuffle_pd(a, b, imm8) _mm_shuffle_pd((a), (b), (imm8)) +#elif defined(SIMDE_SHUFFLE_VECTOR_) +#define simde_mm_shuffle_pd(a, b, imm8) \ + (__extension__({ \ + simde__m128d_from_private((simde__m128d_private){ \ + .f64 = SIMDE_SHUFFLE_VECTOR_( \ + 64, 16, simde__m128d_to_private(a).f64, \ + simde__m128d_to_private(b).f64, \ + (((imm8)) & 1), (((imm8) >> 1) & 1) + 2)}); \ + })) +#endif +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_shuffle_pd(a, b, imm8) simde_mm_shuffle_pd(a, b, imm8) #endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_shufflehi_epi16(simde__m128i a, const int imm8) + SIMDE_REQUIRE_RANGE(imm8, 0, 255) { - simde__m128i r; + simde__m128i_private r_, a_ = simde__m128i_to_private(a); - r.i64[0] = a.i64[0]; - for (size_t i = 4; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) { - r.i16[i] = a.i16[((imm8 >> ((i - 4) * 2)) & 3) + 4]; + SIMDE_VECTORIZE + for (size_t i = 0; i < ((sizeof(a_.i16) / sizeof(a_.i16[0])) / 2); + i++) { + r_.i16[i] = a_.i16[i]; + } + for (size_t i = ((sizeof(a_.i16) / sizeof(a_.i16[0])) / 2); + i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) { + r_.i16[i] = a_.i16[((imm8 >> ((i - 4) * 2)) & 3) + 4]; } - return r; -} -#if defined(SIMDE_SSE2_NATIVE) -#define simde_mm_shufflehi_epi16(a, imm8) \ - SIMDE__M128I_C(_mm_shufflehi_epi16((a).n, (imm8))) -#elif defined(SIMDE__SHUFFLE_VECTOR) -#define simde_mm_shufflehi_epi16(a, imm8) \ - ({ \ - const simde__m128i simde__tmp_a_ = a; \ - (simde__m128i){.i16 = SIMDE__SHUFFLE_VECTOR( \ - 16, 16, (simde__tmp_a_).i16, \ - (simde__tmp_a_).i16, 0, 1, 2, 3, \ - (((imm8)) & 3) + 4, \ - (((imm8) >> 2) & 3) + 4, \ - (((imm8) >> 4) & 3) + 4, \ - (((imm8) >> 6) & 3) + 4)}; \ - }) -#endif - -SIMDE__FUNCTION_ATTRIBUTES + return simde__m128i_from_private(r_); +} +#if defined(SIMDE_X86_SSE2_NATIVE) +#define simde_mm_shufflehi_epi16(a, imm8) _mm_shufflehi_epi16((a), (imm8)) +#elif defined(SIMDE_SHUFFLE_VECTOR_) +#define simde_mm_shufflehi_epi16(a, imm8) \ + (__extension__({ \ + const simde__m128i_private simde__tmp_a_ = \ + simde__m128i_to_private(a); \ + simde__m128i_from_private((simde__m128i_private){ \ + .i16 = SIMDE_SHUFFLE_VECTOR_( \ + 16, 16, (simde__tmp_a_).i16, \ + (simde__tmp_a_).i16, 0, 1, 2, 3, \ + (((imm8)) & 3) + 4, (((imm8) >> 2) & 3) + 4, \ + (((imm8) >> 4) & 3) + 4, \ + (((imm8) >> 6) & 3) + 4)}); \ + })) +#endif +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_shufflehi_epi16(a, imm8) simde_mm_shufflehi_epi16(a, imm8) +#endif + +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_shufflelo_epi16(simde__m128i a, const int imm8) + SIMDE_REQUIRE_RANGE(imm8, 0, 255) { - simde__m128i r; + simde__m128i_private r_, a_ = simde__m128i_to_private(a); - for (size_t i = 0; i < ((sizeof(r.i16) / sizeof(r.i16[0])) / 2); i++) { - r.i16[i] = a.i16[((imm8 >> (i * 2)) & 3)]; + for (size_t i = 0; i < ((sizeof(r_.i16) / sizeof(r_.i16[0])) / 2); + i++) { + r_.i16[i] = a_.i16[((imm8 >> (i * 2)) & 3)]; + } + SIMDE_VECTORIZE + for (size_t i = ((sizeof(a_.i16) / 
sizeof(a_.i16[0])) / 2); + i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) { + r_.i16[i] = a_.i16[i]; } - r.i64[1] = a.i64[1]; - return r; + return simde__m128i_from_private(r_); } -#if defined(SIMDE_SSE2_NATIVE) -#define simde_mm_shufflelo_epi16(a, imm8) \ - SIMDE__M128I_C(_mm_shufflelo_epi16((a).n, (imm8))) -#elif defined(SIMDE__SHUFFLE_VECTOR) -#define simde_mm_shufflelo_epi16(a, imm8) \ - ({ \ - const simde__m128i simde__tmp_a_ = a; \ - (simde__m128i){.i16 = SIMDE__SHUFFLE_VECTOR( \ - 16, 16, (simde__tmp_a_).i16, \ - (simde__tmp_a_).i16, (((imm8)) & 3), \ - (((imm8) >> 2) & 3), \ - (((imm8) >> 4) & 3), \ - (((imm8) >> 6) & 3), 4, 5, 6, 7)}; \ - }) -#endif - -SIMDE__FUNCTION_ATTRIBUTES +#if defined(SIMDE_X86_SSE2_NATIVE) +#define simde_mm_shufflelo_epi16(a, imm8) _mm_shufflelo_epi16((a), (imm8)) +#elif defined(SIMDE_SHUFFLE_VECTOR_) +#define simde_mm_shufflelo_epi16(a, imm8) \ + (__extension__({ \ + const simde__m128i_private simde__tmp_a_ = \ + simde__m128i_to_private(a); \ + simde__m128i_from_private((simde__m128i_private){ \ + .i16 = SIMDE_SHUFFLE_VECTOR_( \ + 16, 16, (simde__tmp_a_).i16, \ + (simde__tmp_a_).i16, (((imm8)) & 3), \ + (((imm8) >> 2) & 3), (((imm8) >> 4) & 3), \ + (((imm8) >> 6) & 3), 4, 5, 6, 7)}); \ + })) +#endif +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_shufflelo_epi16(a, imm8) simde_mm_shufflelo_epi16(a, imm8) +#endif + +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_sll_epi16(simde__m128i a, simde__m128i count) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128I_C(_mm_sll_epi16(a.n, count.n)); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_sll_epi16(a, count); #else - simde__m128i r; + simde__m128i_private r_, a_ = simde__m128i_to_private(a), + count_ = simde__m128i_to_private(count); - if (count.u64[0] > 15) + if (count_.u64[0] > 15) return simde_mm_setzero_si128(); - const int s = (int)(count.u64[0]); - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.u16) / sizeof(r.u16[0])); i++) { - r.u16[i] = a.u16[i] << s; +#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + r_.u16 = (a_.u16 << count_.u64[0]); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.u16) / sizeof(r_.u16[0])); i++) { + r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, + (a_.u16[i] << count_.u64[0])); } - return r; +#endif + + return simde__m128i_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_sll_epi16(a, count) simde_mm_sll_epi16((a), (count)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_sll_epi32(simde__m128i a, simde__m128i count) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128I_C(_mm_sll_epi32(a.n, count.n)); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_sll_epi32(a, count); #else - simde__m128i r; + simde__m128i_private r_, a_ = simde__m128i_to_private(a), + count_ = simde__m128i_to_private(count); - if (count.u64[0] > 31) + if (count_.u64[0] > 31) return simde_mm_setzero_si128(); - const int s = (int)(count.u64[0]); - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) { - r.i32[i] = a.i32[i] << s; +#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + r_.u32 = (a_.u32 << count_.u64[0]); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.u32) / sizeof(r_.u32[0])); i++) { + r_.u32[i] = HEDLEY_STATIC_CAST(uint32_t, + (a_.u32[i] << count_.u64[0])); } - return r; +#endif + + return simde__m128i_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_sll_epi32(a, count) (simde_mm_sll_epi32(a, (count))) +#endif 
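As a quick sanity check of the shift-by-register fallback above (illustrative only, not part of the patch; the bare "sse2.h" include path is an assumption and depends on how the bundled libobs copy of SIMDe is reached from your build):

#include <stdint.h>
#include <stdio.h>
#include "sse2.h" /* assumed include path for the bundled SIMDe header */

int main(void)
{
	int16_t out[8];
	simde__m128i v = simde_mm_set1_epi16(3);

	/* The shift amount lives in the low 64 bits of the second argument. */
	simde__m128i r = simde_mm_sll_epi16(v, simde_mm_cvtsi32_si128(4));
	simde_mm_storeu_si128((simde__m128i *)out, r);
	printf("%d\n", out[0]); /* 3 << 4 == 48 */

	/* Counts wider than the element (here > 15) yield all zeros, matching
	 * the early return to simde_mm_setzero_si128() in the portable path. */
	r = simde_mm_sll_epi16(v, simde_mm_cvtsi32_si128(16));
	simde_mm_storeu_si128((simde__m128i *)out, r);
	printf("%d\n", out[0]); /* 0 */
	return 0;
}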
-SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_sll_epi64(simde__m128i a, simde__m128i count) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128I_C(_mm_sll_epi64(a.n, count.n)); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_sll_epi64(a, count); #else - simde__m128i r; + simde__m128i_private r_, a_ = simde__m128i_to_private(a), + count_ = simde__m128i_to_private(count); - if (count.u64[0] > 63) + if (count_.u64[0] > 63) return simde_mm_setzero_si128(); - const int s = (int)(count.u64[0]); - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) { - r.i64[i] = a.i64[i] << s; + const int_fast16_t s = HEDLEY_STATIC_CAST(int_fast16_t, count_.u64[0]); +#if !defined(SIMDE_BUG_GCC_94488) + SIMDE_VECTORIZE +#endif + for (size_t i = 0; i < (sizeof(r_.u64) / sizeof(r_.u64[0])); i++) { + r_.u64[i] = a_.u64[i] << s; } - return r; + + return simde__m128i_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_sll_epi64(a, count) (simde_mm_sll_epi64(a, (count))) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_sqrt_pd(simde__m128d a) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128D_C(_mm_sqrt_pd(a.n)); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_sqrt_pd(a); #else - simde__m128d r; + simde__m128d_private r_, a_ = simde__m128d_to_private(a); - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) { - r.f64[i] = sqrt(a.f64[i]); +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + r_.neon_f64 = vsqrtq_f64(a_.neon_f64); +#elif defined(simde_math_sqrt) + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.f64) / sizeof(r_.f64[0])); i++) { + r_.f64[i] = simde_math_sqrt(a_.f64[i]); } +#else + HEDLEY_UNREACHABLE(); +#endif - return r; + return simde__m128d_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_sqrt_pd(a) simde_mm_sqrt_pd(a) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_sqrt_sd(simde__m128d a, simde__m128d b) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128D_C(_mm_sqrt_sd(a.n, b.n)); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_sqrt_sd(a, b); +#elif defined(SIMDE_ASSUME_VECTORIZATION) + return simde_mm_move_sd(a, simde_mm_sqrt_pd(b)); #else - simde__m128d r; - r.f64[0] = sqrt(b.f64[0]); - r.f64[1] = a.f64[1]; - return r; + simde__m128d_private r_, a_ = simde__m128d_to_private(a), + b_ = simde__m128d_to_private(b); + +#if defined(simde_math_sqrt) + r_.f64[0] = simde_math_sqrt(b_.f64[0]); + r_.f64[1] = a_.f64[1]; +#else + HEDLEY_UNREACHABLE(); +#endif + + return simde__m128d_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_sqrt_sd(a, b) simde_mm_sqrt_sd(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_srl_epi16(simde__m128i a, simde__m128i count) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128I_C(_mm_srl_epi16(a.n, count.n)); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_srl_epi16(a, count); #else - simde__m128i r; + simde__m128i_private r_, a_ = simde__m128i_to_private(a), + count_ = simde__m128i_to_private(count); - if (count.u64[0] > 15) - return simde_mm_setzero_si128(); - const int s = (int)(count.u64[0]); + const int cnt = HEDLEY_STATIC_CAST( + int, (count_.i64[0] > 16 ? 
16 : count_.i64[0])); - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.u16) / sizeof(r.u16[0])); i++) { - r.u16[i] = a.u16[i] >> s; +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_u16 = vshlq_u16(a_.neon_u16, + vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, -cnt))); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.u16) / sizeof(r_.u16[0])); i++) { + r_.u16[i] = a_.u16[i] >> cnt; } - return r; +#endif + + return simde__m128i_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_srl_epi16(a, count) (simde_mm_srl_epi16(a, (count))) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_srl_epi32(simde__m128i a, simde__m128i count) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128I_C(_mm_srl_epi32(a.n, count.n)); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_srl_epi32(a, count); #else - simde__m128i r; + simde__m128i_private r_, a_ = simde__m128i_to_private(a), + count_ = simde__m128i_to_private(count); - if (count.u64[0] > 31) - return simde_mm_setzero_si128(); - const int s = (int)(count.u64[0]); + const int cnt = HEDLEY_STATIC_CAST( + int, (count_.i64[0] > 32 ? 32 : count_.i64[0])); - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.u32) / sizeof(r.u32[0])); i++) { - r.u32[i] = a.u32[i] >> s; +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_u32 = vshlq_u32(a_.neon_u32, + vdupq_n_s32(HEDLEY_STATIC_CAST(int32_t, -cnt))); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.u32) / sizeof(r_.u32[0])); i++) { + r_.u32[i] = a_.u32[i] >> cnt; } - return r; +#endif + + return simde__m128i_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_srl_epi32(a, count) (simde_mm_srl_epi32(a, (count))) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_srl_epi64(simde__m128i a, simde__m128i count) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128I_C(_mm_srl_epi64(a.n, count.n)); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_srl_epi64(a, count); #else - simde__m128i r; + simde__m128i_private r_, a_ = simde__m128i_to_private(a), + count_ = simde__m128i_to_private(count); - if (count.u64[0] > 31) - return simde_mm_setzero_si128(); - const int s = (int)(count.u64[0]); + const int cnt = HEDLEY_STATIC_CAST( + int, (count_.i64[0] > 64 ? 64 : count_.i64[0])); - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.u64) / sizeof(r.u64[0])); i++) { - r.u64[i] = a.u64[i] >> s; +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_u64 = vshlq_u64(a_.neon_u64, + vdupq_n_s64(HEDLEY_STATIC_CAST(int64_t, -cnt))); +#else +#if !defined(SIMDE_BUG_GCC_94488) + SIMDE_VECTORIZE +#endif + for (size_t i = 0; i < (sizeof(r_.u64) / sizeof(r_.u64[0])); i++) { + r_.u64[i] = a_.u64[i] >> cnt; } - return r; +#endif + + return simde__m128i_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_srl_epi64(a, count) (simde_mm_srl_epi64(a, (count))) +#endif -SIMDE__FUNCTION_ATTRIBUTES -simde__m128i simde_mm_srai_epi16(simde__m128i a, int imm8) +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i simde_mm_srai_epi16(simde__m128i a, const int imm8) + SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m128i r; + /* MSVC requires a range of (0, 255). */ + simde__m128i_private r_, a_ = simde__m128i_to_private(a); - const uint16_t m = - (uint16_t)((~0U) << ((sizeof(int16_t) * CHAR_BIT) - imm8)); + const int cnt = (imm8 & ~15) ? 
15 : imm8; - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r) / sizeof(r.u16[0])); i++) { - const uint16_t is_neg = ((uint16_t)( - ((a.u16[i]) >> ((sizeof(int16_t) * CHAR_BIT) - 1)))); - r.u16[i] = (a.u16[i] >> imm8) | (m * is_neg); +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i16 = vshlq_s16(a_.neon_i16, vdupq_n_s16(-cnt)); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_) / sizeof(r_.i16[0])); i++) { + r_.i16[i] = a_.i16[i] >> cnt; } +#endif - return r; + return simde__m128i_from_private(r_); } -#if defined(SIMDE_SSE2_NATIVE) -#define simde_mm_srai_epi16(a, imm8) \ - SIMDE__M128I_C(_mm_srai_epi16((a).n, (imm8))); +#if defined(SIMDE_X86_SSE2_NATIVE) +#define simde_mm_srai_epi16(a, imm8) _mm_srai_epi16((a), (imm8)) +#endif +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_srai_epi16(a, imm8) simde_mm_srai_epi16(a, imm8) #endif -SIMDE__FUNCTION_ATTRIBUTES -simde__m128i simde_mm_srai_epi32(simde__m128i a, int imm8) +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i simde_mm_srai_epi32(simde__m128i a, const int imm8) + SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m128i r; + /* MSVC requires a range of (0, 255). */ + simde__m128i_private r_, a_ = simde__m128i_to_private(a); - const uint32_t m = - (uint32_t)((~0U) << ((sizeof(int) * CHAR_BIT) - imm8)); - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r) / sizeof(r.u32[0])); i++) { - uint32_t is_neg = ((uint32_t)( - ((a.u32[i]) >> ((sizeof(int32_t) * CHAR_BIT) - 1)))); - r.u32[i] = (a.u32[i] >> imm8) | (m * is_neg); + const int cnt = (imm8 & ~31) ? 31 : imm8; + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i32 = vshlq_s32(a_.neon_i32, vdupq_n_s32(-cnt)); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_) / sizeof(r_.i32[0])); i++) { + r_.i32[i] = a_.i32[i] >> cnt; } +#endif - return r; + return simde__m128i_from_private(r_); } -#if defined(SIMDE_SSE2_NATIVE) -#define simde_mm_srai_epi32(a, imm8) \ - SIMDE__M128I_C(_mm_srai_epi32((a).n, (imm8))) -#elif defined(SIMDE_SSE2_NEON) -#define simde_mm_srai_epi32(a, imm8) \ - SIMDE__M128I_NEON_C( \ - i32, \ - ((imm8) <= 0) \ - ? (a.neon_i32) \ - : (((imm8) > 31) \ - ? (vshrq_n_s32(vshrq_n_s32(a.neon_i32, 16), \ - 16)) \ - : (vshrq_n_s32(a.neon_i32, (imm8))))) -#endif - -SIMDE__FUNCTION_ATTRIBUTES +#if defined(SIMDE_X86_SSE2_NATIVE) +#define simde_mm_srai_epi32(a, imm8) _mm_srai_epi32((a), (imm8)) +#endif +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_srai_epi32(a, imm8) simde_mm_srai_epi32(a, imm8) +#endif + +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_sra_epi16(simde__m128i a, simde__m128i count) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128I_C(_mm_sra_epi16(a.n, count.n)); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_sra_epi16(a, count); #else - simde__m128i r; - int cnt = (int)count.i64[0]; + simde__m128i_private r_, a_ = simde__m128i_to_private(a), + count_ = simde__m128i_to_private(count); - if (cnt > 15 || cnt < 0) { - for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); - i++) { - r.u16[i] = (a.i16[i] < 0) ? 0xffff : 0x0000; - } - } else { - const uint16_t m = (uint16_t)( - (~0U) << ((sizeof(int16_t) * CHAR_BIT) - cnt)); - for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); - i++) { - const uint16_t is_neg = a.i16[i] < 0; - r.u16[i] = (a.u16[i] >> cnt) | (m * is_neg); - } + const int cnt = HEDLEY_STATIC_CAST( + int, (count_.i64[0] > 15 ? 
15 : count_.i64[0])); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i16 = vshlq_s16(a_.neon_i16, + vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, -cnt))); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) { + r_.i16[i] = a_.i16[i] >> cnt; } +#endif - return r; + return simde__m128i_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_sra_epi16(a, count) (simde_mm_sra_epi16(a, count)) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_sra_epi32(simde__m128i a, simde__m128i count) { -#if defined(SIMDE_SSE2_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_MM_SRA_EPI32) - return SIMDE__M128I_C(_mm_sra_epi32(a.n, count.n)); +#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_MM_SRA_EPI32) + return _mm_sra_epi32(a, count); #else - simde__m128i r; - const uint64_t cnt = count.u64[0]; + simde__m128i_private r_, a_ = simde__m128i_to_private(a), + count_ = simde__m128i_to_private(count); - if (cnt > 31) { - for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); - i++) { - r.u32[i] = (a.i32[i] < 0) ? UINT32_MAX : 0; - } - } else if (cnt == 0) { - memcpy(&r, &a, sizeof(r)); - } else { - const uint32_t m = (uint32_t)( - (~0U) << ((sizeof(int32_t) * CHAR_BIT) - cnt)); - for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); - i++) { - const uint32_t is_neg = a.i32[i] < 0; - r.u32[i] = (a.u32[i] >> cnt) | (m * is_neg); - } + const int cnt = count_.u64[0] > 31 + ? 31 + : HEDLEY_STATIC_CAST(int, count_.u64[0]); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i32 = vshlq_s32(a_.neon_i32, + vdupq_n_s32(HEDLEY_STATIC_CAST(int32_t, -cnt))); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) { + r_.i32[i] = a_.i32[i] >> cnt; } +#endif - return r; + return simde__m128i_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_sra_epi32(a, count) (simde_mm_sra_epi32(a, (count))) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_slli_epi16(simde__m128i a, const int imm8) + SIMDE_REQUIRE_RANGE(imm8, 0, 255) { - simde__m128i r; - const int s = (imm8 > ((int)sizeof(r.i16[0]) * CHAR_BIT) - 1) ? 0 - : imm8; - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) { - r.i16[i] = a.i16[i] << s; + simde__m128i_private r_, a_ = simde__m128i_to_private(a); + +#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + r_.i16 = a_.i16 << (imm8 & 0xff); +#else + const int s = + (imm8 > + HEDLEY_STATIC_CAST(int, sizeof(r_.i16[0]) * CHAR_BIT) - 1) + ? 0 + : imm8; + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) { + r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, a_.i16[i] << s); } - return r; +#endif + + return simde__m128i_from_private(r_); } -#if defined(SIMDE_SSE2_NATIVE) -#define simde_mm_slli_epi16(a, imm8) SIMDE__M128I_C(_mm_slli_epi16(a.n, imm8)); -#elif defined(SIMDE_SSE2_NEON) -#define simde_mm_slli_epi16(a, imm8) \ - SIMDE__M128I_NEON_C( \ - i16, ((imm8) <= 0) \ - ? ((a).neon_i16) \ - : (((imm8) > 31) ? 
(vdupq_n_s16(0)) \ - : (vshlq_n_s16((a).neon_i16, \ - (imm8))))) +#if defined(SIMDE_X86_SSE2_NATIVE) +#define simde_mm_slli_epi16(a, imm8) _mm_slli_epi16(a, imm8) +#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) +#define simde_mm_slli_epi16(a, imm8) \ + simde__m128i_from_neon_u16( \ + vshlq_n_u16(simde__m128i_to_neon_u16(a), (imm8))) +#endif +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_slli_epi16(a, imm8) simde_mm_slli_epi16(a, imm8) #endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_slli_epi32(simde__m128i a, const int imm8) + SIMDE_REQUIRE_RANGE(imm8, 0, 255) { - simde__m128i r; - const int s = (imm8 > ((int)sizeof(r.i32[0]) * CHAR_BIT) - 1) ? 0 - : imm8; - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) { - r.i32[i] = a.i32[i] << s; + simde__m128i_private r_, a_ = simde__m128i_to_private(a); + +#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + r_.i32 = a_.i32 << imm8; +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) { + r_.i32[i] = a_.i32[i] << (imm8 & 0xff); } - return r; +#endif + + return simde__m128i_from_private(r_); } -#if defined(SIMDE_SSE2_NATIVE) -#define simde_mm_slli_epi32(a, imm8) SIMDE__M128I_C(_mm_slli_epi32(a.n, imm8)); -#elif defined(SIMDE_SSE2_NEON) -#define simde_mm_slli_epi32(a, imm8) \ - SIMDE__M128I_NEON_C( \ - i32, ((imm8) <= 0) \ - ? ((a).neon_i32) \ - : (((imm8) > 31) ? (vdupq_n_s32(0)) \ - : (vshlq_n_s32((a).neon_i32, \ - (imm8))))) +#if defined(SIMDE_X86_SSE2_NATIVE) +#define simde_mm_slli_epi32(a, imm8) _mm_slli_epi32(a, imm8) +#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) +#define simde_mm_slli_epi32(a, imm8) \ + simde__m128i_from_neon_u32( \ + vshlq_n_u32(simde__m128i_to_neon_u32(a), (imm8))) +#endif +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_slli_epi32(a, imm8) simde_mm_slli_epi32(a, imm8) #endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_slli_epi64(simde__m128i a, const int imm8) + SIMDE_REQUIRE_RANGE(imm8, 0, 255) { - simde__m128i r; - const int s = (imm8 > ((int)sizeof(r.i64[0]) * CHAR_BIT) - 1) ? 0 - : imm8; - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) { - r.i64[i] = a.i64[i] << s; + simde__m128i_private r_, a_ = simde__m128i_to_private(a); + +#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + r_.i64 = a_.i64 << imm8; +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i64) / sizeof(r_.i64[0])); i++) { + r_.i64[i] = a_.i64[i] << (imm8 & 0xff); } - return r; +#endif + + return simde__m128i_from_private(r_); } -#if defined(SIMDE_SSE2_NATIVE) -#define simde_mm_slli_epi64(a, imm8) SIMDE__M128I_C(_mm_slli_epi64(a.n, imm8)); +#if defined(SIMDE_X86_SSE2_NATIVE) +#define simde_mm_slli_epi64(a, imm8) _mm_slli_epi64(a, imm8) +#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) +#define simde_mm_slli_epi64(a, imm8) \ + simde__m128i_from_neon_u64( \ + vshlq_n_u64(simde__m128i_to_neon_u64(a), (imm8))) +#endif +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_slli_epi64(a, imm8) simde_mm_slli_epi64(a, imm8) #endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_srli_epi16(simde__m128i a, const int imm8) + SIMDE_REQUIRE_RANGE(imm8, 0, 255) { - simde__m128i r; - const int s = (imm8 > ((int)sizeof(r.i16[0]) * CHAR_BIT) - 1) ? 
0 - : imm8; - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) { - r.u16[i] = a.u16[i] >> s; + simde__m128i_private r_, a_ = simde__m128i_to_private(a); + +#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + r_.u16 = a_.u16 >> imm8; +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) { + r_.u16[i] = a_.u16[i] >> (imm8 & 0xff); } - return r; +#endif + + return simde__m128i_from_private(r_); } -#if defined(SIMDE_SSE2_NATIVE) -#define simde_mm_srli_epi16(a, imm8) SIMDE__M128I_C(_mm_srli_epi16(a.n, imm8)); -#elif defined(SIMDE_SSE2_NEON) -#define simde_mm_srli_epi16(a, imm8) \ - SIMDE__M128I_NEON_C( \ - u16, ((imm8) <= 0) \ - ? ((a).neon_u16) \ - : (((imm8) > 31) ? (vdupq_n_u16(0)) \ - : (vshrq_n_u16((a).neon_u16, \ - (imm8))))) +#if defined(SIMDE_X86_SSE2_NATIVE) +#define simde_mm_srli_epi16(a, imm8) _mm_srli_epi16(a, imm8) +#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) +#define simde_mm_srli_epi16(a, imm8) \ + simde__m128i_from_neon_u16( \ + vshrq_n_u16(simde__m128i_to_neon_u16(a), imm8)) +#endif +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_srli_epi16(a, imm8) simde_mm_srli_epi16(a, imm8) #endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_srli_epi32(simde__m128i a, const int imm8) + SIMDE_REQUIRE_RANGE(imm8, 0, 255) { - simde__m128i r; - const int s = (imm8 > ((int)sizeof(r.i32[0]) * CHAR_BIT) - 1) ? 0 - : imm8; - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) { - r.u32[i] = a.u32[i] >> s; + simde__m128i_private r_, a_ = simde__m128i_to_private(a); + +#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + r_.u32 = a_.u32 >> (imm8 & 0xff); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) { + r_.u32[i] = a_.u32[i] >> (imm8 & 0xff); } - return r; +#endif + + return simde__m128i_from_private(r_); } -#if defined(SIMDE_SSE2_NATIVE) -#define simde_mm_srli_epi32(a, imm8) SIMDE__M128I_C(_mm_srli_epi32(a.n, imm8)) -#elif defined(SIMDE_SSE2_NEON) -#define simde_mm_srli_epi32(a, imm8) \ - SIMDE__M128I_NEON_C( \ - u32, ((imm8) <= 0) \ - ? ((a).neon_u32) \ - : (((imm8) > 31) ? 
(vdupq_n_u32(0)) \ - : (vshrq_n_u32((a).neon_u32, \ - (imm8))))) +#if defined(SIMDE_X86_SSE2_NATIVE) +#define simde_mm_srli_epi32(a, imm8) _mm_srli_epi32(a, imm8) +#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) +#define simde_mm_srli_epi32(a, imm8) \ + simde__m128i_from_neon_u32( \ + vshrq_n_u32(simde__m128i_to_neon_u32(a), imm8)) +#endif +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_srli_epi32(a, imm8) simde_mm_srli_epi32(a, imm8) #endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_srli_epi64(simde__m128i a, const int imm8) + SIMDE_REQUIRE_RANGE(imm8, 0, 255) { - simde__m128i r; - const unsigned char s = imm8 & 255; - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) { - if (s > 63) { - r.u64[i] = 0; - } else { - r.u64[i] = a.u64[i] >> s; - } + simde__m128i_private r_, a_ = simde__m128i_to_private(a); + + if (HEDLEY_UNLIKELY((imm8 & 63) != imm8)) + return simde_mm_setzero_si128(); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_u64 = vshlq_u64(a_.neon_u64, vdupq_n_s64(-imm8)); +#else +#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_94488) + r_.u64 = a_.u64 >> imm8; +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i64) / sizeof(r_.i64[0])); i++) { + r_.u64[i] = a_.u64[i] >> imm8; } - return r; +#endif +#endif + + return simde__m128i_from_private(r_); } -#if defined(SIMDE_SSE2_NATIVE) -#define simde_mm_srli_epi64(a, imm8) SIMDE__M128I_C(_mm_srli_epi64(a.n, imm8)) -#elif defined(SIMDE_SSE2_NEON) -#define simde_mm_srli_epi64(a, imm8) \ - SIMDE__M128I_NEON_C( \ - u64, \ - (((imm8)&255) < 0 || ((imm8)&255) > 63) \ - ? (vdupq_n_u64(0)) \ - : ((((imm8)&255) == 0) \ - ? (a.neon_u64) \ - : (vshrq_n_u64((a).neon_u64, (imm8)&255)))) +#if defined(SIMDE_X86_SSE2_NATIVE) +#define simde_mm_srli_epi64(a, imm8) _mm_srli_epi64(a, imm8) +#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) +#define simde_mm_srli_epi64(a, imm8) \ + ((imm8 == 0) ? 
(a)                                                  \
+		     : (simde__m128i_from_neon_u64(vshrq_n_u64(              \
+			       simde__m128i_to_neon_u64(a), imm8))))
+#endif
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
+#define _mm_srli_epi64(a, imm8) simde_mm_srli_epi64(a, imm8)
 #endif
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 void simde_mm_store_pd(simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)],
 		       simde__m128d a)
 {
 	simde_assert_aligned(16, mem_addr);
 
-#if defined(SIMDE_SSE2_NATIVE)
-	_mm_store_pd(mem_addr, a.n);
+#if defined(SIMDE_X86_SSE2_NATIVE)
+	_mm_store_pd(mem_addr, a);
+#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+	vst1q_f64(mem_addr, simde__m128d_to_private(a).neon_f64);
 #else
-	SIMDE__ASSUME_ALIGNED(mem_addr, 16);
-	memcpy(mem_addr, &a, sizeof(a));
+	simde_memcpy(mem_addr, &a, sizeof(a));
 #endif
 }
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
+#define _mm_store_pd(mem_addr, a) \
+	simde_mm_store_pd(HEDLEY_REINTERPRET_CAST(double *, mem_addr), a)
+#endif
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 void simde_mm_store1_pd(simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)],
 			simde__m128d a)
 {
 	simde_assert_aligned(16, mem_addr);
 
-#if defined(SIMDE_SSE2_NATIVE)
-	_mm_store1_pd(mem_addr, a.n);
+#if defined(SIMDE_X86_SSE2_NATIVE)
+	_mm_store1_pd(mem_addr, a);
 #else
-	SIMDE__ASSUME_ALIGNED(mem_addr, 16);
-	mem_addr[0] = a.f64[0];
-	mem_addr[1] = a.f64[0];
+	simde__m128d_private a_ = simde__m128d_to_private(a);
+
+	mem_addr[0] = a_.f64[0];
+	mem_addr[1] = a_.f64[0];
 #endif
 }
-#define simde_mm_store_pd1(mem_addr, a) simde_mm_store1_pd(mem_addr, a)
+#define simde_mm_store_pd1(mem_addr, a) \
+	simde_mm_store1_pd(HEDLEY_REINTERPRET_CAST(double *, mem_addr), a)
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
+#define _mm_store1_pd(mem_addr, a) \
+	simde_mm_store1_pd(HEDLEY_REINTERPRET_CAST(double *, mem_addr), a)
+#define _mm_store_pd1(mem_addr, a) \
+	simde_mm_store_pd1(HEDLEY_REINTERPRET_CAST(double *, mem_addr), a)
+#endif
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 void simde_mm_store_sd(simde_float64 *mem_addr, simde__m128d a)
 {
-#if defined(SIMDE_SSE2_NATIVE)
-	_mm_store_sd(mem_addr, a.n);
+#if defined(SIMDE_X86_SSE2_NATIVE)
+	_mm_store_sd(mem_addr, a);
+#else
+	simde__m128d_private a_ = simde__m128d_to_private(a);
+
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+	simde_float64 v = vgetq_lane_f64(a_.neon_f64, 0);
+	simde_memcpy(mem_addr, &v, sizeof(simde_float64));
 #else
-	memcpy(mem_addr, &a, sizeof(a.f64[0]));
+	simde_float64 v = a_.f64[0];
+	simde_memcpy(mem_addr, &v, sizeof(simde_float64));
+#endif
 #endif
 }
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
+#define _mm_store_sd(mem_addr, a) \
+	simde_mm_store_sd(HEDLEY_REINTERPRET_CAST(double *, mem_addr), a)
+#endif
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 void simde_mm_store_si128(simde__m128i *mem_addr, simde__m128i a)
 {
-#if defined(SIMDE_SSE2_NATIVE)
-	_mm_store_si128(&mem_addr->n, a.n);
-#elif defined(SIMDE_SSE2_NEON)
-	vst1q_s32((int32_t *)mem_addr, a.neon_i32);
+#if defined(SIMDE_X86_SSE2_NATIVE)
+	_mm_store_si128(HEDLEY_STATIC_CAST(__m128i *, mem_addr), a);
+#else
+	simde__m128i_private a_ = simde__m128i_to_private(a);
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	vst1q_s32(HEDLEY_REINTERPRET_CAST(int32_t *, mem_addr), a_.neon_i32);
 #else
-	SIMDE__ASSUME_ALIGNED(mem_addr, 16);
-	memcpy(mem_addr, &a, sizeof(a));
+	simde_memcpy(SIMDE_ASSUME_ALIGNED(16, mem_addr), &a_, sizeof(a_));
+#endif
 #endif
 }
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
+#define _mm_store_si128(mem_addr, a) simde_mm_store_si128(mem_addr, a)
+#endif
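For reference, a minimal caller-side sketch of the store wrappers above (illustrative only, not part of the patch; the include path is again an assumption). simde_mm_store_pd asserts 16-byte alignment of mem_addr, while simde_mm_store_sd writes only the low lane and has no alignment requirement:

#include <stdalign.h>
#include <stdio.h>
#include "sse2.h" /* assumed include path for the bundled SIMDe header */

int main(void)
{
	alignas(16) double buf[2]; /* simde_mm_store_pd requires 16-byte alignment */
	double lo;

	/* _mm_set_pd takes (high, low), so lane 0 is 1.0 and lane 1 is 2.0. */
	simde__m128d v = simde_mm_set_pd(2.0, 1.0);

	simde_mm_store_pd(buf, v); /* both lanes, aligned store */
	simde_mm_store_sd(&lo, v); /* low lane only, no alignment needed */

	printf("%g %g %g\n", buf[0], buf[1], lo); /* 1 2 1 */
	return 0;
}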
-SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES void simde_mm_storeh_pd(simde_float64 *mem_addr, simde__m128d a) { -#if defined(SIMDE_SSE2_NATIVE) - _mm_storeh_pd(mem_addr, a.n); +#if defined(SIMDE_X86_SSE2_NATIVE) + _mm_storeh_pd(mem_addr, a); +#else + simde__m128d_private a_ = simde__m128d_to_private(a); + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + *mem_addr = vgetq_lane_f64(a_.neon_f64, 1); #else - *mem_addr = a.f64[1]; + *mem_addr = a_.f64[1]; +#endif #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_storeh_pd(mem_addr, a) \ + simde_mm_storeh_pd(HEDLEY_REINTERPRET_CAST(double *, mem_addr), a) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES void simde_mm_storel_epi64(simde__m128i *mem_addr, simde__m128i a) { -#if defined(SIMDE_SSE2_NATIVE) - _mm_storel_epi64(&(mem_addr->n), a.n); -#elif defined(SIMDE_SSE2_NEON) - mem_addr->i64[0] = vgetq_lane_s64(a.neon_i64, 0); +#if defined(SIMDE_X86_SSE2_NATIVE) + _mm_storel_epi64(HEDLEY_STATIC_CAST(__m128i *, mem_addr), a); +#else + simde__m128i_private a_ = simde__m128i_to_private(a); + int64_t tmp; + + /* memcpy to prevent aliasing, tmp because we can't take the + * address of a vector element. */ + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + tmp = vgetq_lane_s64(a_.neon_i64, 0); +#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) +#if defined(SIMDE_BUG_GCC_95227) + (void)a_; +#endif + tmp = vec_extract(a_.altivec_i64, 0); #else - mem_addr->i64[0] = a.i64[0]; + tmp = a_.i64[0]; +#endif + + simde_memcpy(mem_addr, &tmp, sizeof(tmp)); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_storel_epi64(mem_addr, a) simde_mm_storel_epi64(mem_addr, a) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES void simde_mm_storel_pd(simde_float64 *mem_addr, simde__m128d a) { -#if defined(SIMDE_SSE2_NATIVE) - _mm_storel_pd(mem_addr, a.n); +#if defined(SIMDE_X86_SSE2_NATIVE) + _mm_storel_pd(mem_addr, a); #else - *mem_addr = a.f64[0]; + simde__m128d_private a_ = simde__m128d_to_private(a); + + *mem_addr = a_.f64[0]; #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_storel_pd(mem_addr, a) \ + simde_mm_storel_pd(HEDLEY_REINTERPRET_CAST(double *, mem_addr), a) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES void simde_mm_storer_pd(simde_float64 mem_addr[2], simde__m128d a) { simde_assert_aligned(16, mem_addr); -#if defined(SIMDE_SSE2_NATIVE) - _mm_storer_pd(mem_addr, a.n); +#if defined(SIMDE_X86_SSE2_NATIVE) + _mm_storer_pd(mem_addr, a); #else - SIMDE__ASSUME_ALIGNED(mem_addr, 16); - mem_addr[0] = a.f64[1]; - mem_addr[1] = a.f64[0]; + simde__m128d_private a_ = simde__m128d_to_private(a); + + mem_addr[0] = a_.f64[1]; + mem_addr[1] = a_.f64[0]; #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_storer_pd(mem_addr, a) \ + simde_mm_storer_pd(HEDLEY_REINTERPRET_CAST(double *, mem_addr), a) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES void simde_mm_storeu_pd(simde_float64 *mem_addr, simde__m128d a) { -#if defined(SIMDE_SSE2_NATIVE) - _mm_storeu_pd(mem_addr, a.n); +#if defined(SIMDE_X86_SSE2_NATIVE) + _mm_storeu_pd(mem_addr, a); #else - memcpy(mem_addr, &a, sizeof(a)); + simde_memcpy(mem_addr, &a, sizeof(a)); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_storeu_pd(mem_addr, a) \ + simde_mm_storeu_pd(HEDLEY_REINTERPRET_CAST(double *, mem_addr), a) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES void simde_mm_storeu_si128(simde__m128i *mem_addr, simde__m128i a) { -#if 
defined(SIMDE_SSE2_NATIVE) - _mm_storeu_si128(&mem_addr->n, a.n); -#elif defined(SIMDE_SSE2_NEON) - int32_t v[4]; - vst1q_s32(v, a.neon_i32); - memcpy(mem_addr, v, sizeof(v)); +#if defined(SIMDE_X86_SSE2_NATIVE) + _mm_storeu_si128(HEDLEY_STATIC_CAST(__m128i *, mem_addr), a); +#else + simde__m128i_private a_ = simde__m128i_to_private(a); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + vst1q_s32(HEDLEY_REINTERPRET_CAST(int32_t *, mem_addr), a_.neon_i32); #else - memcpy(mem_addr, &a, sizeof(a)); + simde_memcpy(mem_addr, &a_, sizeof(a_)); +#endif #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_storeu_si128(mem_addr, a) simde_mm_storeu_si128(mem_addr, a) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES void simde_mm_stream_pd(simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128d a) { -#if defined(SIMDE_SSE2_NATIVE) - _mm_stream_pd(mem_addr, a.n); + simde_assert_aligned(16, mem_addr); + +#if defined(SIMDE_X86_SSE2_NATIVE) + _mm_stream_pd(mem_addr, a); #else - SIMDE__ASSUME_ALIGNED(mem_addr, 16); - memcpy(mem_addr, &a, sizeof(a)); + simde_memcpy(mem_addr, &a, sizeof(a)); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_stream_pd(mem_addr, a) \ + simde_mm_stream_pd(HEDLEY_REINTERPRET_CAST(double *, mem_addr), a) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES void simde_mm_stream_si128(simde__m128i *mem_addr, simde__m128i a) { -#if defined(SIMDE_SSE2_NATIVE) - _mm_stream_si128(&mem_addr->n, a.n); + simde_assert_aligned(16, mem_addr); + +#if defined(SIMDE_X86_SSE2_NATIVE) + _mm_stream_si128(HEDLEY_STATIC_CAST(__m128i *, mem_addr), a); #else - SIMDE__ASSUME_ALIGNED(mem_addr, 16); - memcpy(mem_addr, &a, sizeof(a)); + simde_memcpy(mem_addr, &a, sizeof(a)); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_stream_si128(mem_addr, a) simde_mm_stream_si128(mem_addr, a) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES void simde_mm_stream_si32(int32_t *mem_addr, int32_t a) { -#if defined(SIMDE_SSE2_NATIVE) +#if defined(SIMDE_X86_SSE2_NATIVE) _mm_stream_si32(mem_addr, a); #else *mem_addr = a; #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_stream_si32(mem_addr, a) simde_mm_stream_si32(mem_addr, a) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES void simde_mm_stream_si64(int64_t *mem_addr, int64_t a) { -#if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64) -#if defined(SIMDE__REALLY_GCC) && !HEDLEY_GCC_VERSION_CHECK(4, 8, 0) - *mem_addr = a; -#elif defined(__GNUC__) - _mm_stream_si64((long long *)mem_addr, a); -#else - _mm_stream_si64(mem_addr, a); -#endif -#else *mem_addr = a; -#endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_stream_si64(mem_addr, a) \ + simde_mm_stream_si64(SIMDE_CHECKED_REINTERPRET_CAST( \ + int64_t *, __int64 *, mem_addr), \ + a) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_sub_epi8(simde__m128i a, simde__m128i b) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128I_C(_mm_sub_epi8(a.n, b.n)); -#elif defined(SIMDE_SSE2_NEON) - return SIMDE__M128I_NEON_C(i8, vsubq_s8(a.neon_i8, b.neon_i8)); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_sub_epi8(a, b); #else - simde__m128i r; - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.i8) / sizeof(r.i8[0])); i++) { - r.i8[i] = a.i8[i] - b.i8[i]; + simde__m128i_private r_, a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i8 = 
vsubq_s8(a_.neon_i8, b_.neon_i8); +#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i8 = a_.i8 - b_.i8; +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])); i++) { + r_.i8[i] = a_.i8[i] - b_.i8[i]; } - return r; +#endif + + return simde__m128i_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_sub_epi8(a, b) simde_mm_sub_epi8(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_sub_epi16(simde__m128i a, simde__m128i b) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128I_C(_mm_sub_epi16(a.n, b.n)); -#elif defined(SIMDE_SSE2_NEON) - return SIMDE__M128I_NEON_C(i16, vsubq_s16(a.neon_i16, b.neon_i16)); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_sub_epi16(a, b); #else - simde__m128i r; - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) { - r.i16[i] = a.i16[i] - b.i16[i]; + simde__m128i_private r_, a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i16 = vsubq_s16(a_.neon_i16, b_.neon_i16); +#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i16 = a_.i16 - b_.i16; +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) { + r_.i16[i] = a_.i16[i] - b_.i16[i]; } - return r; +#endif + + return simde__m128i_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_sub_epi16(a, b) simde_mm_sub_epi16(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_sub_epi32(simde__m128i a, simde__m128i b) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128I_C(_mm_sub_epi32(a.n, b.n)); -#elif defined(SIMDE_SSE2_NEON) - return SIMDE__M128I_NEON_C(i32, vsubq_s32(a.neon_i32, b.neon_i32)); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_sub_epi32(a, b); #else - simde__m128i r; - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) { - r.i32[i] = a.i32[i] - b.i32[i]; + simde__m128i_private r_, a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i32 = vsubq_s32(a_.neon_i32, b_.neon_i32); +#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i32 = a_.i32 - b_.i32; +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) { + r_.i32[i] = a_.i32[i] - b_.i32[i]; } - return r; +#endif + + return simde__m128i_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_sub_epi32(a, b) simde_mm_sub_epi32(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_sub_epi64(simde__m128i a, simde__m128i b) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128I_C(_mm_sub_epi64(a.n, b.n)); -#elif defined(SIMDE_SSE2_NEON) - return SIMDE__M128I_NEON_C(i64, vsubq_s64(a.neon_i64, b.neon_i64)); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_sub_epi64(a, b); #else - simde__m128i r; - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) { - r.i64[i] = a.i64[i] - b.i64[i]; + simde__m128i_private r_, a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i64 = vsubq_s64(a_.neon_i64, b_.neon_i64); +#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i64 = a_.i64 - b_.i64; +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i64) / sizeof(r_.i64[0])); i++) { + r_.i64[i] = a_.i64[i] - b_.i64[i]; + } +#endif + + return 
simde__m128i_from_private(r_); +#endif +} +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_sub_epi64(a, b) simde_mm_sub_epi64(a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i simde_x_mm_sub_epu32(simde__m128i a, simde__m128i b) +{ + simde__m128i_private r_, a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); + +#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.u32 = a_.u32 - b_.u32; +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.u32) / sizeof(r_.u32[0])); i++) { + r_.u32[i] = a_.u32[i] - b_.u32[i]; } - return r; #endif + + return simde__m128i_from_private(r_); } -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_sub_pd(simde__m128d a, simde__m128d b) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128D_C(_mm_sub_pd(a.n, b.n)); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_sub_pd(a, b); #else - simde__m128d r; - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) { - r.f64[i] = a.f64[i] - b.f64[i]; + simde__m128d_private r_, a_ = simde__m128d_to_private(a), + b_ = simde__m128d_to_private(b); + +#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.f64 = a_.f64 - b_.f64; +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f64x2_sub(a_.wasm_v128, b_.wasm_v128); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.f64) / sizeof(r_.f64[0])); i++) { + r_.f64[i] = a_.f64[i] - b_.f64[i]; } - return r; +#endif + + return simde__m128d_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_sub_pd(a, b) simde_mm_sub_pd(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_sub_sd(simde__m128d a, simde__m128d b) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128D_C(_mm_sub_sd(a.n, b.n)); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_sub_sd(a, b); +#elif defined(SIMDE_ASSUME_VECTORIZATION) + return simde_mm_move_sd(a, simde_mm_sub_pd(a, b)); #else - simde__m128d r; - r.f64[0] = a.f64[0] - b.f64[0]; - r.f64[1] = a.f64[1]; - return r; + simde__m128d_private r_, a_ = simde__m128d_to_private(a), + b_ = simde__m128d_to_private(b); + + r_.f64[0] = a_.f64[0] - b_.f64[0]; + r_.f64[1] = a_.f64[1]; + + return simde__m128d_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_sub_sd(a, b) simde_mm_sub_sd(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m64 simde_mm_sub_si64(simde__m64 a, simde__m64 b) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M64_C(_mm_sub_si64(a.n, b.n)); +#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) + return _mm_sub_si64(a, b); #else - simde__m64 r; - r.i64[0] = a.i64[0] - b.i64[0]; - return r; + simde__m64_private r_, a_ = simde__m64_to_private(a), + b_ = simde__m64_to_private(b); + +#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i64 = a_.i64 - b_.i64; +#else + r_.i64[0] = a_.i64[0] - b_.i64[0]; +#endif + + return simde__m64_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_sub_si64(a, b) simde_mm_sub_si64(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_subs_epi8(simde__m128i a, simde__m128i b) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128I_C(_mm_subs_epi8(a.n, b.n)); -#elif defined(SIMDE_SSE2_NEON) - return SIMDE__M128I_NEON_C(i8, vqsubq_s8(a.neon_i8, b.neon_i8)); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_subs_epi8(a, b); #else - simde__m128i r; - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r) / 
sizeof(r.i8[0])); i++) { - if (((b.i8[i]) > 0 && (a.i8[i]) < INT8_MIN + (b.i8[i]))) { - r.i8[i] = INT8_MIN; - } else if ((b.i8[i]) < 0 && (a.i8[i]) > INT8_MAX + (b.i8[i])) { - r.i8[i] = INT8_MAX; + simde__m128i_private r_, a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i8 = vqsubq_s8(a_.neon_i8, b_.neon_i8); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_) / sizeof(r_.i8[0])); i++) { + if (((b_.i8[i]) > 0 && (a_.i8[i]) < INT8_MIN + (b_.i8[i]))) { + r_.i8[i] = INT8_MIN; + } else if ((b_.i8[i]) < 0 && + (a_.i8[i]) > INT8_MAX + (b_.i8[i])) { + r_.i8[i] = INT8_MAX; } else { - r.i8[i] = (a.i8[i]) - (b.i8[i]); + r_.i8[i] = (a_.i8[i]) - (b_.i8[i]); } } - return r; +#endif + + return simde__m128i_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_subs_epi8(a, b) simde_mm_subs_epi8(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_subs_epi16(simde__m128i a, simde__m128i b) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128I_C(_mm_subs_epi16(a.n, b.n)); -#elif defined(SIMDE_SSE2_NEON) - return SIMDE__M128I_NEON_C(i16, vqsubq_s16(a.neon_i16, b.neon_i16)); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_subs_epi16(a, b); #else - simde__m128i r; - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r) / sizeof(r.i16[0])); i++) { - if (((b.i16[i]) > 0 && (a.i16[i]) < INT16_MIN + (b.i16[i]))) { - r.i16[i] = INT16_MIN; - } else if ((b.i16[i]) < 0 && - (a.i16[i]) > INT16_MAX + (b.i16[i])) { - r.i16[i] = INT16_MAX; + simde__m128i_private r_, a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i16 = vqsubq_s16(a_.neon_i16, b_.neon_i16); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_) / sizeof(r_.i16[0])); i++) { + if (((b_.i16[i]) > 0 && + (a_.i16[i]) < INT16_MIN + (b_.i16[i]))) { + r_.i16[i] = INT16_MIN; + } else if ((b_.i16[i]) < 0 && + (a_.i16[i]) > INT16_MAX + (b_.i16[i])) { + r_.i16[i] = INT16_MAX; } else { - r.i16[i] = (a.i16[i]) - (b.i16[i]); + r_.i16[i] = (a_.i16[i]) - (b_.i16[i]); } } - return r; +#endif + + return simde__m128i_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_subs_epi16(a, b) simde_mm_subs_epi16(a, b) +#endif -SIMDE__FUNCTION_ATTRIBUTES +SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_subs_epu8(simde__m128i a, simde__m128i b) { -#if defined(SIMDE_SSE2_NATIVE) - return SIMDE__M128I_C(_mm_subs_epu8(a.n, b.n)); -#elif defined(SIMDE_SSE2_NEON) - return SIMDE__M128I_NEON_C(u8, vqsubq_u8(a.neon_u8, b.neon_u8)); +#if defined(SIMDE_X86_SSE2_NATIVE) + return _mm_subs_epu8(a, b); #else - simde__m128i r; - SIMDE__VECTORIZE - for (size_t i = 0; i < (sizeof(r) / sizeof(r.i8[0])); i++) { - const int32_t x = a.u8[i] - b.u8[i]; + simde__m128i_private r_, a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_u8 = vqsubq_u8(a_.neon_u8, b_.neon_u8); +#else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_) / sizeof(r_.i8[0])); i++) { + const int32_t x = a_.u8[i] - b_.u8[i]; if (x < 0) { - r.u8[i] = 0; + r_.u8[i] = 0; } else if (x > UINT8_MAX) { - r.u8[i] = UINT8_MAX; + r_.u8[i] = UINT8_MAX; } else { - r.u8[i] = (uint8_t)x; + r_.u8[i] = HEDLEY_STATIC_CAST(uint8_t, x); } } - return r; +#endif + + return simde__m128i_from_private(r_); #endif } +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#define _mm_subs_epu8(a, b) 
simde_mm_subs_epu8(a, b)
+#endif
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m128i simde_mm_subs_epu16(simde__m128i a, simde__m128i b)
 {
-#if defined(SIMDE_SSE2_NATIVE)
-	return SIMDE__M128I_C(_mm_subs_epu16(a.n, b.n));
-#elif defined(SIMDE_SSE2_NEON)
-	return SIMDE__M128I_NEON_C(u16, vqsubq_u16(a.neon_u16, b.neon_u16));
+#if defined(SIMDE_X86_SSE2_NATIVE)
+	return _mm_subs_epu16(a, b);
 #else
-	simde__m128i r;
-	SIMDE__VECTORIZE
-	for (size_t i = 0; i < (sizeof(r) / sizeof(r.i16[0])); i++) {
-		const int32_t x = a.u16[i] - b.u16[i];
+	simde__m128i_private r_, a_ = simde__m128i_to_private(a),
+			     b_ = simde__m128i_to_private(b);
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	r_.neon_u16 = vqsubq_u16(a_.neon_u16, b_.neon_u16);
+#else
+	SIMDE_VECTORIZE
+	for (size_t i = 0; i < (sizeof(r_) / sizeof(r_.i16[0])); i++) {
+		const int32_t x = a_.u16[i] - b_.u16[i];
 		if (x < 0) {
-			r.u16[i] = 0;
+			r_.u16[i] = 0;
 		} else if (x > UINT16_MAX) {
-			r.u16[i] = UINT16_MAX;
+			r_.u16[i] = UINT16_MAX;
 		} else {
-			r.u16[i] = (uint16_t)x;
+			r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, x);
 		}
 	}
-	return r;
+#endif
+
+	return simde__m128i_from_private(r_);
 #endif
 }
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
+#define _mm_subs_epu16(a, b) simde_mm_subs_epu16(a, b)
+#endif
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 int simde_mm_ucomieq_sd(simde__m128d a, simde__m128d b)
 {
-#if defined(SIMDE_SSE2_NATIVE)
-	return _mm_ucomieq_sd(a.n, b.n);
+#if defined(SIMDE_X86_SSE2_NATIVE)
+	return _mm_ucomieq_sd(a, b);
 #else
+	simde__m128d_private a_ = simde__m128d_to_private(a),
+			     b_ = simde__m128d_to_private(b);
+	int r;
+
+#if defined(SIMDE_HAVE_FENV_H)
 	fenv_t envp;
 	int x = feholdexcept(&envp);
-	int r = a.f64[0] == b.f64[0];
+	r = a_.f64[0] == b_.f64[0];
 	if (HEDLEY_LIKELY(x == 0))
 		fesetenv(&envp);
+#else
+	r = a_.f64[0] == b_.f64[0];
+#endif
+
 	return r;
 #endif
 }
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
+#define _mm_ucomieq_sd(a, b) simde_mm_ucomieq_sd(a, b)
+#endif
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 int simde_mm_ucomige_sd(simde__m128d a, simde__m128d b)
 {
-#if defined(SIMDE_SSE2_NATIVE)
-	return _mm_ucomige_sd(a.n, b.n);
+#if defined(SIMDE_X86_SSE2_NATIVE)
+	return _mm_ucomige_sd(a, b);
 #else
+	simde__m128d_private a_ = simde__m128d_to_private(a),
+			     b_ = simde__m128d_to_private(b);
+	int r;
+
+#if defined(SIMDE_HAVE_FENV_H)
 	fenv_t envp;
 	int x = feholdexcept(&envp);
-	int r = a.f64[0] >= b.f64[0];
+	r = a_.f64[0] >= b_.f64[0];
 	if (HEDLEY_LIKELY(x == 0))
 		fesetenv(&envp);
+#else
+	r = a_.f64[0] >= b_.f64[0];
+#endif
+
 	return r;
 #endif
 }
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
+#define _mm_ucomige_sd(a, b) simde_mm_ucomige_sd(a, b)
+#endif
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 int simde_mm_ucomigt_sd(simde__m128d a, simde__m128d b)
 {
-#if defined(SIMDE_SSE2_NATIVE)
-	return _mm_ucomigt_sd(a.n, b.n);
+#if defined(SIMDE_X86_SSE2_NATIVE)
+	return _mm_ucomigt_sd(a, b);
 #else
+	simde__m128d_private a_ = simde__m128d_to_private(a),
+			     b_ = simde__m128d_to_private(b);
+	int r;
+
+#if defined(SIMDE_HAVE_FENV_H)
 	fenv_t envp;
 	int x = feholdexcept(&envp);
-	int r = a.f64[0] > b.f64[0];
+	r = a_.f64[0] > b_.f64[0];
 	if (HEDLEY_LIKELY(x == 0))
 		fesetenv(&envp);
+#else
+	r = a_.f64[0] > b_.f64[0];
+#endif
+
 	return r;
 #endif
 }
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
+#define _mm_ucomigt_sd(a, b) simde_mm_ucomigt_sd(a, b)
+#endif
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 int simde_mm_ucomile_sd(simde__m128d a, simde__m128d b)
 {
-#if defined(SIMDE_SSE2_NATIVE)
-	return _mm_ucomile_sd(a.n, b.n);
+#if defined(SIMDE_X86_SSE2_NATIVE)
+	return _mm_ucomile_sd(a, b);
 #else
+	simde__m128d_private a_ = simde__m128d_to_private(a),
+			     b_ = simde__m128d_to_private(b);
+	int r;
+
+#if defined(SIMDE_HAVE_FENV_H)
 	fenv_t envp;
 	int x = feholdexcept(&envp);
-	int r = a.f64[0] <= b.f64[0];
+	r = a_.f64[0] <= b_.f64[0];
 	if (HEDLEY_LIKELY(x == 0))
 		fesetenv(&envp);
+#else
+	r = a_.f64[0] <= b_.f64[0];
+#endif
+
 	return r;
 #endif
 }
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
+#define _mm_ucomile_sd(a, b) simde_mm_ucomile_sd(a, b)
+#endif
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 int simde_mm_ucomilt_sd(simde__m128d a, simde__m128d b)
 {
-#if defined(SIMDE_SSE2_NATIVE)
-	return _mm_ucomilt_sd(a.n, b.n);
+#if defined(SIMDE_X86_SSE2_NATIVE)
+	return _mm_ucomilt_sd(a, b);
 #else
+	simde__m128d_private a_ = simde__m128d_to_private(a),
+			     b_ = simde__m128d_to_private(b);
+	int r;
+
+#if defined(SIMDE_HAVE_FENV_H)
 	fenv_t envp;
 	int x = feholdexcept(&envp);
-	int r = a.f64[0] < b.f64[0];
+	r = a_.f64[0] < b_.f64[0];
 	if (HEDLEY_LIKELY(x == 0))
 		fesetenv(&envp);
+#else
+	r = a_.f64[0] < b_.f64[0];
+#endif
+
 	return r;
 #endif
 }
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
+#define _mm_ucomilt_sd(a, b) simde_mm_ucomilt_sd(a, b)
+#endif
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 int simde_mm_ucomineq_sd(simde__m128d a, simde__m128d b)
 {
-#if defined(SIMDE_SSE2_NATIVE)
-	return _mm_ucomineq_sd(a.n, b.n);
+#if defined(SIMDE_X86_SSE2_NATIVE)
+	return _mm_ucomineq_sd(a, b);
 #else
+	simde__m128d_private a_ = simde__m128d_to_private(a),
+			     b_ = simde__m128d_to_private(b);
+	int r;
+
+#if defined(SIMDE_HAVE_FENV_H)
 	fenv_t envp;
 	int x = feholdexcept(&envp);
-	int r = a.f64[0] != b.f64[0];
+	r = a_.f64[0] != b_.f64[0];
 	if (HEDLEY_LIKELY(x == 0))
 		fesetenv(&envp);
-	return r;
-#endif
-}
-
-SIMDE__FUNCTION_ATTRIBUTES
-simde__m128d simde_mm_undefined_pd(void)
-{
-	simde__m128d r;
-
-#if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE__HAVE_UNDEFINED128)
-	r.n = _mm_undefined_pd();
 #else
-	r = simde_mm_setzero_pd();
+	r = a_.f64[0] != b_.f64[0];
 #endif
 
 	return r;
+#endif
 }
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
+#define _mm_ucomineq_sd(a, b) simde_mm_ucomineq_sd(a, b)
+#endif
 
-SIMDE__FUNCTION_ATTRIBUTES
-simde__m128i simde_mm_undefined_si128(void)
-{
-	simde__m128i r;
-
-#if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE__HAVE_UNDEFINED128)
-	r.n = _mm_undefined_si128();
-#else
-	r = simde_mm_setzero_si128();
+#if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_
 #endif
-	return r;
-}
+#if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
+HEDLEY_DIAGNOSTIC_POP
+#endif
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 void simde_mm_lfence(void)
 {
-#if defined(SIMDE_SSE2_NATIVE)
+#if defined(SIMDE_X86_SSE2_NATIVE)
 	_mm_lfence();
 #else
 	simde_mm_sfence();
 #endif
 }
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
+#define _mm_lfence() simde_mm_lfence()
+#endif
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 void simde_mm_mfence(void)
 {
-#if defined(SIMDE_SSE2_NATIVE)
+#if defined(SIMDE_X86_SSE2_NATIVE)
 	_mm_mfence();
 #else
 	simde_mm_sfence();
 #endif
 }
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
+#define _mm_mfence() simde_mm_mfence()
+#endif
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m128i simde_mm_unpackhi_epi8(simde__m128i a, simde__m128i b)
 {
-#if defined(SIMDE_SSE2_NATIVE)
-	return SIMDE__M128I_C(_mm_unpackhi_epi8(a.n, b.n));
-#elif defined(SIMDE_SSE2_NEON)
-	int8x8_t a1 = vreinterpret_s8_s16(vget_high_s16(a.neon_i16));
-	int8x8_t b1 = vreinterpret_s8_s16(vget_high_s16(b.neon_i16));
-	int8x8x2_t result = vzip_s8(a1, b1);
-	return SIMDE__M128I_NEON_C(i8,
-				   vcombine_s8(result.val[0], result.val[1]));
+#if defined(SIMDE_X86_SSE2_NATIVE)
+	return _mm_unpackhi_epi8(a, b);
 #else
-	simde__m128i r;
-	SIMDE__VECTORIZE
-	for (size_t i = 0; i < ((sizeof(r) / sizeof(r.i8[0])) / 2); i++) {
-		r.i8[(i * 2)] = a.i8[i + ((sizeof(r) / sizeof(r.i8[0])) / 2)];
-		r.i8[(i * 2) + 1] =
-			b.i8[i + ((sizeof(r) / sizeof(r.i8[0])) / 2)];
+	simde__m128i_private r_, a_ = simde__m128i_to_private(a),
+			     b_ = simde__m128i_to_private(b);
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	int8x8_t a1 = vreinterpret_s8_s16(vget_high_s16(a_.neon_i16));
+	int8x8_t b1 = vreinterpret_s8_s16(vget_high_s16(b_.neon_i16));
+	int8x8x2_t result = vzip_s8(a1, b1);
+	r_.neon_i8 = vcombine_s8(result.val[0], result.val[1]);
+#elif defined(SIMDE_SHUFFLE_VECTOR_)
+	r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, b_.i8, 8, 24, 9, 25, 10, 26,
+				      11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
+#else
+	SIMDE_VECTORIZE
+	for (size_t i = 0; i < ((sizeof(r_) / sizeof(r_.i8[0])) / 2); i++) {
+		r_.i8[(i * 2)] =
+			a_.i8[i + ((sizeof(r_) / sizeof(r_.i8[0])) / 2)];
+		r_.i8[(i * 2) + 1] =
+			b_.i8[i + ((sizeof(r_) / sizeof(r_.i8[0])) / 2)];
 	}
-	return r;
+#endif
+
+	return simde__m128i_from_private(r_);
 #endif
 }
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
+#define _mm_unpackhi_epi8(a, b) simde_mm_unpackhi_epi8(a, b)
+#endif
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m128i simde_mm_unpackhi_epi16(simde__m128i a, simde__m128i b)
 {
-#if defined(SIMDE_SSE2_NATIVE)
-	return SIMDE__M128I_C(_mm_unpackhi_epi16(a.n, b.n));
-#elif defined(SIMDE_SSE2_NEON)
-	int16x4_t a1 = vget_high_s16(a.neon_i16);
-	int16x4_t b1 = vget_high_s16(b.neon_i16);
-	int16x4x2_t result = vzip_s16(a1, b1);
-	return SIMDE__M128I_NEON_C(i16,
-				   vcombine_s16(result.val[0], result.val[1]));
+#if defined(SIMDE_X86_SSE2_NATIVE)
+	return _mm_unpackhi_epi16(a, b);
 #else
-	simde__m128i r;
-	SIMDE__VECTORIZE
-	for (size_t i = 0; i < ((sizeof(r) / sizeof(r.i16[0])) / 2); i++) {
-		r.i16[(i * 2)] =
-			a.i16[i + ((sizeof(r) / sizeof(r.i16[0])) / 2)];
-		r.i16[(i * 2) + 1] =
-			b.i16[i + ((sizeof(r) / sizeof(r.i16[0])) / 2)];
+	simde__m128i_private r_, a_ = simde__m128i_to_private(a),
+			     b_ = simde__m128i_to_private(b);
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	int16x4_t a1 = vget_high_s16(a_.neon_i16);
+	int16x4_t b1 = vget_high_s16(b_.neon_i16);
+	int16x4x2_t result = vzip_s16(a1, b1);
+	r_.neon_i16 = vcombine_s16(result.val[0], result.val[1]);
+#elif defined(SIMDE_SHUFFLE_VECTOR_)
+	r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, b_.i16, 4, 12, 5, 13, 6,
+				       14, 7, 15);
+#else
+	SIMDE_VECTORIZE
+	for (size_t i = 0; i < ((sizeof(r_) / sizeof(r_.i16[0])) / 2); i++) {
+		r_.i16[(i * 2)] =
+			a_.i16[i + ((sizeof(r_) / sizeof(r_.i16[0])) / 2)];
+		r_.i16[(i * 2) + 1] =
+			b_.i16[i + ((sizeof(r_) / sizeof(r_.i16[0])) / 2)];
 	}
-	return r;
+#endif
+
+	return simde__m128i_from_private(r_);
 #endif
 }
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
+#define _mm_unpackhi_epi16(a, b) simde_mm_unpackhi_epi16(a, b)
+#endif
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m128i simde_mm_unpackhi_epi32(simde__m128i a, simde__m128i b)
 {
-#if defined(SIMDE_SSE2_NATIVE)
-	return SIMDE__M128I_C(_mm_unpackhi_epi32(a.n, b.n));
-#elif defined(SIMDE_SSE2_NEON)
-	int32x2_t a1 = vget_high_s32(a.neon_i32);
-	int32x2_t b1 = vget_high_s32(b.neon_i32);
-	int32x2x2_t result = vzip_s32(a1, b1);
-	return SIMDE__M128I_NEON_C(i32,
-				   vcombine_s32(result.val[0], result.val[1]));
+#if defined(SIMDE_X86_SSE2_NATIVE)
+	return _mm_unpackhi_epi32(a, b);
 #else
-	simde__m128i r;
-	SIMDE__VECTORIZE
-	for (size_t i = 0; i < ((sizeof(r) / sizeof(r.i32[0])) / 2); i++) {
-		r.i32[(i * 2)] =
-			a.i32[i + ((sizeof(r) / sizeof(r.i32[0])) / 2)];
-		r.i32[(i * 2) + 1] =
-			b.i32[i + ((sizeof(r) / sizeof(r.i32[0])) / 2)];
+	simde__m128i_private r_, a_ = simde__m128i_to_private(a),
+			     b_ = simde__m128i_to_private(b);
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	int32x2_t a1 = vget_high_s32(a_.neon_i32);
+	int32x2_t b1 = vget_high_s32(b_.neon_i32);
+	int32x2x2_t result = vzip_s32(a1, b1);
+	r_.neon_i32 = vcombine_s32(result.val[0], result.val[1]);
+#elif defined(SIMDE_SHUFFLE_VECTOR_)
+	r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.i32, b_.i32, 2, 6, 3, 7);
+#else
+	SIMDE_VECTORIZE
+	for (size_t i = 0; i < ((sizeof(r_) / sizeof(r_.i32[0])) / 2); i++) {
+		r_.i32[(i * 2)] =
+			a_.i32[i + ((sizeof(r_) / sizeof(r_.i32[0])) / 2)];
+		r_.i32[(i * 2) + 1] =
+			b_.i32[i + ((sizeof(r_) / sizeof(r_.i32[0])) / 2)];
 	}
-	return r;
+#endif
+
+	return simde__m128i_from_private(r_);
 #endif
 }
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
+#define _mm_unpackhi_epi32(a, b) simde_mm_unpackhi_epi32(a, b)
+#endif
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m128i simde_mm_unpackhi_epi64(simde__m128i a, simde__m128i b)
 {
-#if defined(SIMDE_SSE2_NATIVE)
-	return SIMDE__M128I_C(_mm_unpackhi_epi64(a.n, b.n));
+#if defined(SIMDE_X86_SSE2_NATIVE)
+	return _mm_unpackhi_epi64(a, b);
 #else
-	simde__m128i r;
-	SIMDE__VECTORIZE
-	for (size_t i = 0; i < ((sizeof(r) / sizeof(r.i64[0])) / 2); i++) {
-		r.i64[(i * 2)] =
-			a.i64[i + ((sizeof(r) / sizeof(r.i64[0])) / 2)];
-		r.i64[(i * 2) + 1] =
-			b.i64[i + ((sizeof(r) / sizeof(r.i64[0])) / 2)];
+	simde__m128i_private r_, a_ = simde__m128i_to_private(a),
+			     b_ = simde__m128i_to_private(b);
+
+#if defined(SIMDE_SHUFFLE_VECTOR_)
+	r_.i64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.i64, b_.i64, 1, 3);
+#else
+	SIMDE_VECTORIZE
+	for (size_t i = 0; i < ((sizeof(r_) / sizeof(r_.i64[0])) / 2); i++) {
+		r_.i64[(i * 2)] =
+			a_.i64[i + ((sizeof(r_) / sizeof(r_.i64[0])) / 2)];
+		r_.i64[(i * 2) + 1] =
+			b_.i64[i + ((sizeof(r_) / sizeof(r_.i64[0])) / 2)];
 	}
-	return r;
+#endif
+
+	return simde__m128i_from_private(r_);
 #endif
 }
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
+#define _mm_unpackhi_epi64(a, b) simde_mm_unpackhi_epi64(a, b)
+#endif
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m128d simde_mm_unpackhi_pd(simde__m128d a, simde__m128d b)
 {
-#if defined(SIMDE_SSE2_NATIVE)
-	return SIMDE__M128D_C(_mm_unpackhi_pd(a.n, b.n));
+#if defined(SIMDE_X86_SSE2_NATIVE)
+	return _mm_unpackhi_pd(a, b);
 #else
-	simde__m128d r;
-	SIMDE__VECTORIZE
-	for (size_t i = 0; i < ((sizeof(r) / sizeof(r.f64[0])) / 2); i++) {
-		r.f64[(i * 2)] =
-			a.f64[i + ((sizeof(r) / sizeof(r.f64[0])) / 2)];
-		r.f64[(i * 2) + 1] =
-			b.f64[i + ((sizeof(r) / sizeof(r.f64[0])) / 2)];
+	simde__m128d_private r_, a_ = simde__m128d_to_private(a),
+			     b_ = simde__m128d_to_private(b);
+
+#if defined(SIMDE_SHUFFLE_VECTOR_)
+	r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, b_.f64, 1, 3);
+#else
+	SIMDE_VECTORIZE
+	for (size_t i = 0; i < ((sizeof(r_) / sizeof(r_.f64[0])) / 2); i++) {
+		r_.f64[(i * 2)] =
+			a_.f64[i + ((sizeof(r_) / sizeof(r_.f64[0])) / 2)];
+		r_.f64[(i * 2) + 1] =
+			b_.f64[i + ((sizeof(r_) / sizeof(r_.f64[0])) / 2)];
 	}
-	return r;
+#endif
+
+	return simde__m128d_from_private(r_);
 #endif
 }
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
+#define _mm_unpackhi_pd(a, b) simde_mm_unpackhi_pd(a, b)
+#endif
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m128i simde_mm_unpacklo_epi8(simde__m128i a, simde__m128i b)
 {
-#if defined(SIMDE_SSE2_NATIVE)
-	return SIMDE__M128I_C(_mm_unpacklo_epi8(a.n, b.n));
-#elif defined(SIMDE_SSE2_NEON)
-	int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(a.neon_i16));
-	int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(b.neon_i16));
-	int8x8x2_t result = vzip_s8(a1, b1);
-	return SIMDE__M128I_NEON_C(i8,
-				   vcombine_s8(result.val[0], result.val[1]));
+#if defined(SIMDE_X86_SSE2_NATIVE)
+	return _mm_unpacklo_epi8(a, b);
 #else
-	simde__m128i r;
-	SIMDE__VECTORIZE
-	for (size_t i = 0; i < ((sizeof(r) / sizeof(r.i8[0])) / 2); i++) {
-		r.i8[(i * 2)] = a.i8[i];
-		r.i8[(i * 2) + 1] = b.i8[i];
+	simde__m128i_private r_, a_ = simde__m128i_to_private(a),
+			     b_ = simde__m128i_to_private(b);
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(a_.neon_i16));
+	int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(b_.neon_i16));
+	int8x8x2_t result = vzip_s8(a1, b1);
+	r_.neon_i8 = vcombine_s8(result.val[0], result.val[1]);
+#elif defined(SIMDE_SHUFFLE_VECTOR_)
+	r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, b_.i8, 0, 16, 1, 17, 2, 18,
+				      3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
+#else
+	SIMDE_VECTORIZE
+	for (size_t i = 0; i < ((sizeof(r_) / sizeof(r_.i8[0])) / 2); i++) {
+		r_.i8[(i * 2)] = a_.i8[i];
+		r_.i8[(i * 2) + 1] = b_.i8[i];
 	}
-	return r;
+#endif
+
+	return simde__m128i_from_private(r_);
 #endif
 }
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
+#define _mm_unpacklo_epi8(a, b) simde_mm_unpacklo_epi8(a, b)
+#endif
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m128i simde_mm_unpacklo_epi16(simde__m128i a, simde__m128i b)
 {
-#if defined(SIMDE_SSE2_NATIVE)
-	return SIMDE__M128I_C(_mm_unpacklo_epi16(a.n, b.n));
-#elif defined(SIMDE_SSE2_NEON)
-	int16x4_t a1 = vget_low_s16(a.neon_i16);
-	int16x4_t b1 = vget_low_s16(b.neon_i16);
-	int16x4x2_t result = vzip_s16(a1, b1);
-	return SIMDE__M128I_NEON_C(i16,
-				   vcombine_s16(result.val[0], result.val[1]));
+#if defined(SIMDE_X86_SSE2_NATIVE)
+	return _mm_unpacklo_epi16(a, b);
 #else
-	simde__m128i r;
-	SIMDE__VECTORIZE
-	for (size_t i = 0; i < ((sizeof(r) / sizeof(r.i16[0])) / 2); i++) {
-		r.i16[(i * 2)] = a.i16[i];
-		r.i16[(i * 2) + 1] = b.i16[i];
+	simde__m128i_private r_, a_ = simde__m128i_to_private(a),
+			     b_ = simde__m128i_to_private(b);
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	int16x4_t a1 = vget_low_s16(a_.neon_i16);
+	int16x4_t b1 = vget_low_s16(b_.neon_i16);
+	int16x4x2_t result = vzip_s16(a1, b1);
+	r_.neon_i16 = vcombine_s16(result.val[0], result.val[1]);
+#elif defined(SIMDE_SHUFFLE_VECTOR_)
+	r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, b_.i16, 0, 8, 1, 9, 2,
+				       10, 3, 11);
+#else
+	SIMDE_VECTORIZE
+	for (size_t i = 0; i < ((sizeof(r_) / sizeof(r_.i16[0])) / 2); i++) {
+		r_.i16[(i * 2)] = a_.i16[i];
+		r_.i16[(i * 2) + 1] = b_.i16[i];
 	}
-	return r;
+#endif
+
+	return simde__m128i_from_private(r_);
 #endif
 }
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
+#define _mm_unpacklo_epi16(a, b) simde_mm_unpacklo_epi16(a, b)
+#endif
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m128i simde_mm_unpacklo_epi32(simde__m128i a, simde__m128i b)
 {
-#if defined(SIMDE_SSE2_NATIVE)
-	return SIMDE__M128I_C(_mm_unpacklo_epi32(a.n, b.n));
-#elif defined(SIMDE_SSE2_NEON)
-	int32x2_t a1 = vget_low_s32(a.neon_i32);
-	int32x2_t b1 = vget_low_s32(b.neon_i32);
-	int32x2x2_t result = vzip_s32(a1, b1);
-	return SIMDE__M128I_NEON_C(i32,
-				   vcombine_s32(result.val[0], result.val[1]));
+#if defined(SIMDE_X86_SSE2_NATIVE)
+	return _mm_unpacklo_epi32(a, b);
 #else
-	simde__m128i r;
-	SIMDE__VECTORIZE
-	for (size_t i = 0; i < ((sizeof(r) / sizeof(r.i32[0])) / 2); i++) {
-		r.i32[(i * 2)] = a.i32[i];
-		r.i32[(i * 2) + 1] = b.i32[i];
+	simde__m128i_private r_, a_ = simde__m128i_to_private(a),
+			     b_ = simde__m128i_to_private(b);
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	int32x2_t a1 = vget_low_s32(a_.neon_i32);
+	int32x2_t b1 = vget_low_s32(b_.neon_i32);
+	int32x2x2_t result = vzip_s32(a1, b1);
+	r_.neon_i32 = vcombine_s32(result.val[0], result.val[1]);
+#elif defined(SIMDE_SHUFFLE_VECTOR_)
+	r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.i32, b_.i32, 0, 4, 1, 5);
+#else
+	SIMDE_VECTORIZE
+	for (size_t i = 0; i < ((sizeof(r_) / sizeof(r_.i32[0])) / 2); i++) {
+		r_.i32[(i * 2)] = a_.i32[i];
+		r_.i32[(i * 2) + 1] = b_.i32[i];
 	}
-	return r;
+#endif
+
+	return simde__m128i_from_private(r_);
 #endif
 }
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
+#define _mm_unpacklo_epi32(a, b) simde_mm_unpacklo_epi32(a, b)
+#endif
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m128i simde_mm_unpacklo_epi64(simde__m128i a, simde__m128i b)
 {
-#if defined(SIMDE_SSE2_NATIVE)
-	return SIMDE__M128I_C(_mm_unpacklo_epi64(a.n, b.n));
+#if defined(SIMDE_X86_SSE2_NATIVE)
+	return _mm_unpacklo_epi64(a, b);
 #else
-	simde__m128i r;
-	SIMDE__VECTORIZE
-	for (size_t i = 0; i < ((sizeof(r) / sizeof(r.i64[0])) / 2); i++) {
-		r.i64[(i * 2)] = a.i64[i];
-		r.i64[(i * 2) + 1] = b.i64[i];
+	simde__m128i_private r_, a_ = simde__m128i_to_private(a),
+			     b_ = simde__m128i_to_private(b);
+
+#if defined(SIMDE_SHUFFLE_VECTOR_)
+	r_.i64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.i64, b_.i64, 0, 2);
+#else
+	SIMDE_VECTORIZE
+	for (size_t i = 0; i < ((sizeof(r_) / sizeof(r_.i64[0])) / 2); i++) {
+		r_.i64[(i * 2)] = a_.i64[i];
+		r_.i64[(i * 2) + 1] = b_.i64[i];
 	}
-	return r;
+#endif
+
+	return simde__m128i_from_private(r_);
 #endif
 }
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
+#define _mm_unpacklo_epi64(a, b) simde_mm_unpacklo_epi64(a, b)
+#endif
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m128d simde_mm_unpacklo_pd(simde__m128d a, simde__m128d b)
 {
-#if defined(SIMDE_SSE2_NATIVE)
-	return SIMDE__M128D_C(_mm_unpacklo_pd(a.n, b.n));
+#if defined(SIMDE_X86_SSE2_NATIVE)
+	return _mm_unpacklo_pd(a, b);
 #else
-	simde__m128d r;
-	SIMDE__VECTORIZE
-	for (size_t i = 0; i < ((sizeof(r) / sizeof(r.f64[0])) / 2); i++) {
-		r.f64[(i * 2)] = a.f64[i];
-		r.f64[(i * 2) + 1] = b.f64[i];
+	simde__m128d_private r_, a_ = simde__m128d_to_private(a),
+			     b_ = simde__m128d_to_private(b);
+
+#if defined(SIMDE_SHUFFLE_VECTOR_)
+	r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, b_.f64, 0, 2);
+#else
+	SIMDE_VECTORIZE
+	for (size_t i = 0; i < ((sizeof(r_) / sizeof(r_.f64[0])) / 2); i++) {
+		r_.f64[(i * 2)] = a_.f64[i];
+		r_.f64[(i * 2) + 1] = b_.f64[i];
 	}
-	return r;
+#endif
+
+	return simde__m128d_from_private(r_);
 #endif
 }
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
+#define _mm_unpacklo_pd(a, b) simde_mm_unpacklo_pd(a, b)
+#endif
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m128d simde_mm_xor_pd(simde__m128d a, simde__m128d b)
 {
-#if defined(SIMDE_SSE2_NATIVE)
-	return SIMDE__M128D_C(_mm_xor_pd(a.n, b.n));
+#if defined(SIMDE_X86_SSE2_NATIVE)
+	return _mm_xor_pd(a, b);
 #else
-	simde__m128d r;
-	SIMDE__VECTORIZE
-	for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
-		r.i64[i] = a.i64[i] ^ b.i64[i];
+	simde__m128d_private r_, a_ = simde__m128d_to_private(a),
+			     b_ = simde__m128d_to_private(b);
+
+#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+	r_.i32f = a_.i32f ^ b_.i32f;
+#else
+	SIMDE_VECTORIZE
+	for (size_t i = 0; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])); i++) {
+		r_.i32f[i] = a_.i32f[i] ^ b_.i32f[i];
 	}
-	return r;
+#endif
+
+	return simde__m128d_from_private(r_);
 #endif
 }
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
+#define _mm_xor_pd(a, b) simde_mm_xor_pd(a, b)
+#endif
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m128i simde_mm_xor_si128(simde__m128i a, simde__m128i b)
 {
-#if defined(SIMDE_SSE2_NATIVE)
-	return SIMDE__M128I_C(_mm_xor_si128(a.n, b.n));
-#elif defined(SIMDE_SSE2_NEON)
-	return SIMDE__M128I_NEON_C(i32, veorq_s32(a.neon_i32, b.neon_i32));
+#if defined(SIMDE_X86_SSE2_NATIVE)
+	return _mm_xor_si128(a, b);
 #else
-	simde__m128i r;
-	SIMDE__VECTORIZE
-	for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
-		r.i32[i] = a.i32[i] ^ b.i32[i];
+	simde__m128i_private r_, a_ = simde__m128i_to_private(a),
+			     b_ = simde__m128i_to_private(b);
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	r_.neon_i32 = veorq_s32(a_.neon_i32, b_.neon_i32);
+#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
+	r_.altivec_i32 = vec_xor(a_.altivec_i32, b_.altivec_i32);
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+	r_.i32f = a_.i32f ^ b_.i32f;
+#else
+	SIMDE_VECTORIZE
+	for (size_t i = 0; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])); i++) {
+		r_.i32f[i] = a_.i32f[i] ^ b_.i32f[i];
 	}
-	return r;
+#endif
+
+	return simde__m128i_from_private(r_);
 #endif
 }
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
+#define _mm_xor_si128(a, b) simde_mm_xor_si128(a, b)
+#endif
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m128i simde_x_mm_not_si128(simde__m128i a)
 {
-#if defined(SIMDE_SSE2_NEON)
-	return SIMDE__M128I_NEON_C(i32, vmvnq_s32(a.neon_i32));
+	simde__m128i_private r_, a_ = simde__m128i_to_private(a);
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	r_.neon_i32 = vmvnq_s32(a_.neon_i32);
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+	r_.i32f = ~(a_.i32f);
 #else
-	simde__m128i r;
-	SIMDE__VECTORIZE
-	for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
-		r.i32[i] = ~(a.i32[i]);
+	SIMDE_VECTORIZE
+	for (size_t i = 0; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])); i++) {
+		r_.i32f[i] = ~(a_.i32f[i]);
 	}
-	return r;
 #endif
+
+	return simde__m128i_from_private(r_);
 }
 
-SIMDE__END_DECLS
+#define SIMDE_MM_SHUFFLE2(x, y) (((x) << 1) | (y))
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
+#define _MM_SHUFFLE2(x, y) SIMDE_MM_SHUFFLE2(x, y)
+#endif
+
+SIMDE_END_DECLS_
+
+HEDLEY_DIAGNOSTIC_POP
 
-#endif /* !defined(SIMDE__SSE2_H) */
+#endif /* !defined(SIMDE_X86_SSE2_H) */