gi_common.h 26.5 KB
Newer Older
1 2 3 4
#pragma once

#include "math.h"
#include "stdint.h"
5
#include "string.h"
6 7 8 9 10 11

#if defined(_WIN32)
#include <intrin.h>
#include <windows.h>
#else
#if defined(__arm__) || defined(__aarch64__)
12
#include "src/arm_common/simd_macro/marm_neon.h"
13 14 15 16
#endif
#if defined(__x86_64__) || defined(__i386__)
#include <cpuid.h>
#include <immintrin.h>
17 18 19
#endif
#endif

20 21 22 23
#if defined(__riscv_vector)
#include <riscv_vector.h>
#endif

24
#if defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
25 26
#define GI_TARGET_X86
#endif
27 28 29

#if defined(__arm__) || defined(__aarch64__)
#define GI_TARGET_ARM
30 31 32 33
#endif

#ifdef _WIN32
//! GI stand for general intrinsic
34
#define _GI_ALIGN_16                           __declspec(align(16))
35 36
#define GI_DECLSPEC_ALIGN(variable, alignment) DECLSPEC_ALIGN(alignment) variable
#else
37
#define _GI_ALIGN_16 __attribute__((aligned(16)))
38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53
#define GI_DECLSPEC_ALIGN(variable, alignment) \
    variable __attribute__((aligned(alignment)))
#endif

#if defined(_MSC_VER)
#define GI_FORCEINLINE __forceinline
#else
#define GI_FORCEINLINE __attribute__((always_inline)) inline
#endif

#if defined(_MSC_VER)
#define GI_INTERNAL_DATA extern "C"
#else
#define GI_INTERNAL_DATA extern "C" __attribute((visibility("hidden")))
#endif

54
#if defined(GI_TARGET_ARM) && defined(__ARM_NEON)
55 56 57
#define GI_NEON_INTRINSICS
#if defined(__aarch64__)
#define GI_NEON64_INTRINSICS
58
#define GI_NEON32_INTRINSICS
59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78
#else
#define GI_NEON32_INTRINSICS
#endif
#elif defined(GI_TARGET_X86)
//#if defined(__FMA__)
//#define GI_FMA_INTRINSICS
//#define GI_AVX2_INTRINSICS
//#define GI_AVX_INTRINSICS
//#elif defined(__AVX2__)
//#define GI_AVX2_INTRINSICS
//#define GI_AVX_INTRINSICS
//#elif defined(__AVX__)
//#define GI_AVX_INTRINSICS
#if defined(__SSE4_2__)
#define GI_SSE42_INTRINSICS
#define GI_SSE2_INTRINSICS
#elif defined(__SSE2__)
#define GI_SSE2_INTRINSICS
#endif
#endif
79 80 81
#if defined(__riscv_vector)
#define GI_RVV_INTRINSICS
#endif
82

83 84 85 86 87 88 89 90 91
#if defined(GI_TEST_NAIVE)
#undef GI_NEON_INTRINSICS
#undef GI_NEON64_INTRINSICS
#undef GI_NEON32_INTRINSICS
#undef GI_FMA_INTRINSICS
#undef GI_AVX2_INTRINSICS
#undef GI_AVX_INTRINSICS
#undef GI_SSE42_INTRINSICS
#undef GI_SSE2_INTRINSICS
92
#undef GI_RVV_INTRINSICS
93 94 95 96 97 98 99 100 101 102 103 104 105
#endif

//! general intrinsic support dynamic length simd, if avx or avx2 the simd
//! length is 256
#if defined(GI_AVX_INTRINSICS) || defined(GI_AVX2_INTRINSICS) || \
        defined(GI_FMA_INTRINSICS)
//! if neon and sse the simd lenght is 128
#define GI_SIMD_LEN      256
#define GI_SIMD_LEN_BYTE 32
#elif defined(GI_NEON_INTRINSICS) || defined(GI_SSE2_INTRINSICS) || \
        defined(GI_SSE42_INTRINSICS)
#define GI_SIMD_LEN      128
#define GI_SIMD_LEN_BYTE 16
106 107 108 109
#elif defined(GI_RVV_INTRINSICS)
//! TODO: make gi algo usable for other GI_SIMD_LEN/GI_SIMD_LEN_BYTE
#define GI_SIMD_LEN      128
#define GI_SIMD_LEN_BYTE 16
110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126
#else
//! if no simd hardware support, the simd is implemented by C, default set to
//! 128
#define GI_SIMD_LEN      128
#define GI_SIMD_LEN_BYTE 16
#endif

#define gi_trap() __builtin_trap()

//! for ci test now
enum GiSimdType {
    GI_UNKNOWN,
    GI_NAIVE,
    GI_AVX,
    GI_SSE42,
    GI_SSE2,
    GI_NEON,
127
    GI_RVV,
128 129
};

130 131
#if defined(GI_AVX_INTRINSICS) || defined(GI_AVX2_INTRINSICS) || \
        defined(GI_FMA_INTRINSICS)
132
#define __gi_simd_type GI_AVX
133 134 135 136 137 138
typedef __m256 GI_FLOAT32_t;
typedef __m256i GI_UINT8_t;
typedef __m256i GI_INT8_t;
typedef __m256i GI_INT16_t;
typedef __m256i GI_INT32_t;
typedef __m256i GI_UINT32_t;
139
#elif defined(GI_NEON_INTRINSICS)
140
#define __gi_simd_type GI_NEON
141 142 143 144 145 146
typedef float32x4_t GI_FLOAT32_t;
typedef uint8x16_t GI_UINT8_t;
typedef int8x16_t GI_INT8_t;
typedef int16x8_t GI_INT16_t;
typedef int32x4_t GI_INT32_t;
typedef uint32x4_t GI_UINT32_t;
147
typedef float32x4x2_t GI_FLOAT32_V2_t;
148
typedef float32x4x3_t GI_FLOAT32_V3_t;
149 150 151 152 153
typedef float32x4x4_t GI_FLOAT32_V4_t;
typedef int32x4x2_t GI_INT32_V2_t;
typedef int32x4x4_t GI_INT32_V4_t;
typedef int16x8x2_t GI_INT16_V2_t;
typedef int8x16x2_t GI_INT8_V2_t;
154 155
typedef int8x16x3_t GI_INT8_V3_t;
typedef int8x16x4_t GI_INT8_V4_t;
156
typedef int64x2_t GI_INT64_t;
157
#elif defined(GI_SSE2_INTRINSICS) || defined(GI_SSE42_INTRINSICS)
158 159 160 161 162 163 164 165 166 167

#if defined(GI_SSE42_INTRINSICS)
#define __gi_simd_type GI_SSE42
#elif defined(GI_SSE2_INTRINSICS)
#define __gi_simd_type GI_SSE2
#else
#define __gi_simd_type GI_UNKNOWN
#error "code issue happened!!"
#endif

168 169 170 171 172 173
typedef __m128 GI_FLOAT32_t;
typedef __m128i GI_UINT8_t;
typedef __m128i GI_INT8_t;
typedef __m128i GI_INT16_t;
typedef __m128i GI_INT32_t;
typedef __m128i GI_UINT32_t;
174
typedef __m128i GI_INT64_t;
175
#define _SWAP_HI_LOW32                    (2 | (3 << 2) | (0 << 4) | (1 << 6))
176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225
#define _INSERTPS_NDX(srcField, dstField) (((srcField) << 6) | ((dstField) << 4))
#define _M64(out, inp)                    _mm_storel_epi64((__m128i*)&(out), inp)
#define _pM128i(a)                        _mm_loadl_epi64((__m128i*)&(a))
#define _pM128(a)                         _mm_castsi128_ps(_pM128i(a))
#define _M128i(a)                         _mm_castps_si128(a)
#define _M128(a)                          _mm_castsi128_ps(a)
#if defined(__x86_64__)
#define _M64f(out, inp) out.m64_i64[0] = _mm_cvtsi128_si64(_M128i(inp));
#else
#define _M64f(out, inp) _mm_storel_epi64((__m128i*)&(out), _M128i(inp))
#endif
#define _SSE_SWITCH16(NAME, a, b, LANE) \
    switch (LANE) {                     \
        case 0:                         \
            return NAME(a b, 0);        \
        case 1:                         \
            return NAME(a b, 1);        \
        case 2:                         \
            return NAME(a b, 2);        \
        case 3:                         \
            return NAME(a b, 3);        \
        case 4:                         \
            return NAME(a b, 4);        \
        case 5:                         \
            return NAME(a b, 5);        \
        case 6:                         \
            return NAME(a b, 6);        \
        case 7:                         \
            return NAME(a b, 7);        \
        case 8:                         \
            return NAME(a b, 8);        \
        case 9:                         \
            return NAME(a b, 9);        \
        case 10:                        \
            return NAME(a b, 10);       \
        case 11:                        \
            return NAME(a b, 11);       \
        case 12:                        \
            return NAME(a b, 12);       \
        case 13:                        \
            return NAME(a b, 13);       \
        case 14:                        \
            return NAME(a b, 14);       \
        case 15:                        \
            return NAME(a b, 15);       \
        default:                        \
            gi_trap();                  \
            return NAME(a b, 0);        \
    }
#if !defined(__SSE3__)
226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252
#ifdef __cplusplus
#define _sse2_mm_alignr_epi8(b, a, imm8)                                    \
    __extension__({                                                         \
        __m128i d;                                                          \
        const int imm2 = sizeof(__m128i) - imm8;                            \
        d = _mm_or_si128(_mm_srli_si128(a, imm8), _mm_slli_si128(b, imm2)); \
        d;                                                                  \
    })
#else
#define _sse2_mm_alignr_epi8(b, a, imm8)                                   \
    __extension__({                                                        \
        __m128i d;                                                         \
        const int imm2 = sizeof(__m128i) - imm8;                           \
        if (16 == imm2) {                                                  \
            d = _mm_or_si128(_mm_srli_si128(a, 0), _mm_slli_si128(b, 16)); \
        } else if (12 == imm2) {                                           \
            d = _mm_or_si128(_mm_srli_si128(a, 4), _mm_slli_si128(b, 12)); \
        } else if (8 == imm2) {                                            \
            d = _mm_or_si128(_mm_srli_si128(a, 8), _mm_slli_si128(b, 8));  \
        } else if (4 == imm2) {                                            \
            d = _mm_or_si128(_mm_srli_si128(a, 12), _mm_slli_si128(b, 4)); \
        } else {                                                           \
            gi_trap();                                                     \
        }                                                                  \
        d;                                                                 \
    })
#endif
253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286
#endif

#define _SSE_COMMA ,
GI_FORCEINLINE __m128i _MM_ALIGNR_EPI8(__m128i a, __m128i b, int LANE) {
#if !defined(__SSE3__)
    _SSE_SWITCH16(_sse2_mm_alignr_epi8, a, _SSE_COMMA b, LANE)
#else
    _SSE_SWITCH16(_mm_alignr_epi8, a, _SSE_COMMA b, LANE)
#endif
}
typedef float float32_t;
typedef double float64_t;
typedef union __m64_128 {
    uint64_t m64_u64[1];
    int64_t m64_i64[1];
    float64_t m64_d64[1];
    uint32_t m64_u32[2];
    int32_t m64_i32[2];
    float32_t m64_f32[2];
    int16_t m64_i16[4];
    uint16_t m64_u16[4];
    int8_t m64_i8[8];
    uint8_t m64_u8[8];
} __m64_128;
typedef __m64_128 float32x2_t;

#define return64(a) \
    _M64(res64, a); \
    return res64;
#define return64f(a) \
    _M64f(res64, a); \
    return res64;
#define _sse_vextq_s32(a, b, c)       _MM_ALIGNR_EPI8(b, a, c * 4)
#define _sse_vget_lane_f32(vec, lane) vec.m64_f32[lane]
287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306
#elif defined(GI_RVV_INTRINSICS)
#define __gi_simd_type GI_RVV
typedef vfloat32m1_t GI_FLOAT32_t;
typedef vuint8m1_t GI_UINT8_t;
typedef vint8m1_t GI_INT8_t;
typedef vint16m1_t GI_INT16_t;
typedef vint32m1_t GI_INT32_t;
typedef vuint32m1_t GI_UINT32_t;
//! FIXME: nezha D1 do not support vmv.x.s instruct
//! as a workaround, define GI_INT64_t to naive
typedef int64_t GI_INT64_RVV_WORKAROUND_t
        __attribute__((vector_size(GI_SIMD_LEN_BYTE)));
typedef GI_INT64_RVV_WORKAROUND_t GI_INT64_t;
typedef vfloat32m1x2_t GI_FLOAT32_V2_t;
typedef vfloat32m1x3_t GI_FLOAT32_V3_t;
typedef vfloat32m1x4_t GI_FLOAT32_V4_t;
typedef vint32m1x2_t GI_INT32_V2_t;
typedef vint32m1x4_t GI_INT32_V4_t;
typedef vint16m1x2_t GI_INT16_V2_t;
typedef vint8m1x2_t GI_INT8_V2_t;
307 308
typedef vint8m1x3_t GI_INT8_V3_t;
typedef vint8m1x4_t GI_INT8_V4_t;
309 310 311 312
//! vfloat32mf2_t usable at RVV1.0, now we support 0.7, as
//! a workaround, we use vfloat32m1_t instead
typedef vfloat32m1_t float32x2_t;

313
#else
314
#define __gi_simd_type GI_NAIVE
315 316 317 318 319 320 321 322 323
typedef float GI_FLOAT32_t __attribute__((vector_size(GI_SIMD_LEN_BYTE)));
typedef uint8_t GI_UINT8_t __attribute__((vector_size(GI_SIMD_LEN_BYTE)));
typedef int8_t GI_INT8_t __attribute__((vector_size(GI_SIMD_LEN_BYTE)));
typedef int16_t GI_INT16_t __attribute__((vector_size(GI_SIMD_LEN_BYTE)));
typedef int32_t GI_INT32_t __attribute__((vector_size(GI_SIMD_LEN_BYTE)));
typedef uint32_t GI_UINT32_t __attribute__((vector_size(GI_SIMD_LEN_BYTE)));
typedef int64_t GI_INT64_t __attribute__((vector_size(GI_SIMD_LEN_BYTE)));
#if !defined(__arm__) && !defined(__aarch64__) || !defined(__ARM_NEON)
typedef float float32x2_t __attribute__((vector_size(GI_SIMD_LEN_BYTE / 2)));
324
#endif
325
typedef float float32_t;
326 327
#endif

328 329 330 331
//! some GI api do not support full GiSimdType
//! for example: GiAbsInt32 do not imp SSE2 case
//! when *_t will define as _m128*(may be long long)
//! vector index do not have same logic as naive vector
332 333 334 335 336 337 338 339
typedef float GI_FLOAT32_NAIVE_t __attribute__((vector_size(GI_SIMD_LEN_BYTE)));
typedef uint8_t GI_UINT8_NAIVE_t __attribute__((vector_size(GI_SIMD_LEN_BYTE)));
typedef int8_t GI_INT8_NAIVE_t __attribute__((vector_size(GI_SIMD_LEN_BYTE)));
typedef int16_t GI_INT16_NAIVE_t __attribute__((vector_size(GI_SIMD_LEN_BYTE)));
typedef int32_t GI_INT32_NAIVE_t __attribute__((vector_size(GI_SIMD_LEN_BYTE)));
typedef uint32_t GI_UINT32_NAIVE_t __attribute__((vector_size(GI_SIMD_LEN_BYTE)));
typedef int64_t GI_INT64_NAIVE_t __attribute__((vector_size(GI_SIMD_LEN_BYTE)));
typedef float float32x2_NAIVE_t __attribute__((vector_size(GI_SIMD_LEN_BYTE / 2)));
340 341 342 343 344 345 346 347 348 349 350 351
typedef struct {
    GI_INT32_NAIVE_t val[2];
} GI_INT32_V2_NAIVE_t;

typedef struct {
    GI_INT32_NAIVE_t val[4];
} GI_INT32_V4_NAIVE_t;

typedef struct {
    GI_FLOAT32_NAIVE_t val[2];
} GI_FLOAT32_V2_NAIVE_t;

352 353 354 355
typedef struct {
    GI_FLOAT32_NAIVE_t val[3];
} GI_FLOAT32_V3_NAIVE_t;

356 357 358 359 360 361 362 363 364 365 366 367
typedef struct {
    GI_FLOAT32_NAIVE_t val[4];
} GI_FLOAT32_V4_NAIVE_t;

typedef struct {
    GI_INT16_NAIVE_t val[2];
} GI_INT16_V2_NAIVE_t;

typedef struct {
    GI_INT8_NAIVE_t val[2];
} GI_INT8_V2_NAIVE_t;

368
#if !defined(GI_NEON_INTRINSICS) && !defined(GI_RVV_INTRINSICS)
369
typedef struct {
370 371
    GI_INT32_t val[2];
} GI_INT32_V2_t;
372 373

typedef struct {
374 375
    GI_INT32_t val[4];
} GI_INT32_V4_t;
376 377

typedef struct {
378 379
    GI_FLOAT32_t val[2];
} GI_FLOAT32_V2_t;
380

381 382 383 384
typedef struct {
    GI_FLOAT32_t val[3];
} GI_FLOAT32_V3_t;

385
typedef struct {
386 387 388 389 390 391 392 393 394 395
    GI_FLOAT32_t val[4];
} GI_FLOAT32_V4_t;

typedef struct {
    GI_INT16_t val[2];
} GI_INT16_V2_t;

typedef struct {
    GI_INT8_t val[2];
} GI_INT8_V2_t;
396

397 398 399 400 401 402 403 404
typedef struct {
    GI_INT8_t val[3];
} GI_INT8_V3_t;

typedef struct {
    GI_INT8_t val[4];
} GI_INT8_V4_t;

405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431
#endif
//! variable length type intrinsic can not be a member of c++ class
//! caused by can not do sizeof at build stage, for example RVV and SVE
//! so we define a type_CLASS to solve this case
//! some variable length type intrinsic can not do array subscript, for
//! example RVV, so we define a GiGetSubVector_xx function to solve this
//! case. when fix-len type in fact will do nothing
#if defined(GI_RVV_INTRINSICS)
typedef GI_FLOAT32_NAIVE_t GI_FLOAT32_FIXLEN_t;
typedef GI_FLOAT32_V2_NAIVE_t GI_FLOAT32_FIXLEN_V2_t;
typedef GI_UINT8_NAIVE_t GI_UINT8_FIXLEN_t;
typedef GI_INT8_NAIVE_t GI_INT8_FIXLEN_t;
typedef GI_INT16_NAIVE_t GI_INT16_FIXLEN_t;
typedef GI_INT32_NAIVE_t GI_INT32_FIXLEN_t;
typedef GI_UINT32_NAIVE_t GI_UINT32_FIXLEN_t;

//! get subvector
#define GiGetSubVectorFloat32V2(s, index) vget_f32m1x2_f32m1(s, index)
#define GiGetSubVectorFloat32V3(s, index) vget_f32m1x3_f32m1(s, index)
#define GiGetSubVectorFloat32V4(s, index) vget_f32m1x4_f32m1(s, index)

#define GiGetSubVectorInt32V2(s, index) vget_i32m1x2_i32m1(s, index)
#define GiGetSubVectorInt32V4(s, index) vget_i32m1x4_i32m1(s, index)

#define GiGetSubVectorInt16V2(s, index) vget_i16m1x2_i16m1(s, index)

#define GiGetSubVectorInt8V2(s, index) vget_i8m1x2_i8m1(s, index)
432 433
#define GiGetSubVectorInt8V3(s, index) vget_i8m1x3_i8m1(s, index)
#define GiGetSubVectorInt8V4(s, index) vget_i8m1x4_i8m1(s, index)
434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546

//! insert subvector
#define GiSetSubVectorFloat32V2(d, index, s) d = vset_f32m1x2(d, index, s)
#define GiSetSubVectorFloat32V3(d, index, s) d = vset_f32m1x3(d, index, s)
#define GiSetSubVectorFloat32V4(d, index, s) d = vset_f32m1x4(d, index, s)

#define GiSetSubVectorInt32V2(d, index, s) d = vset_i32m1x2(d, index, s)
#define GiSetSubVectorInt32V4(d, index, s) d = vset_i32m1x4(d, index, s)

#define GiSetSubVectorInt16V2(d, index, s) d = vset_i16m1x2(d, index, s)

#define GiSetSubVectorInt8V2(d, index, s) d = vset_i8m1x2(d, index, s)

//! convert
#define GiFloat32Type2FixLenType(s)                                     \
    __extension__({                                                     \
        GI_FLOAT32_FIXLEN_t d;                                          \
        vse32_v_f32m1((float*)&d, s, GI_SIMD_LEN_BYTE / sizeof(float)); \
        d;                                                              \
    })

#define GiFixLenType2GiFloat32Type(s)                                    \
    __extension__({                                                      \
        GI_FLOAT32_t d;                                                  \
        d = vle32_v_f32m1((float*)&s, GI_SIMD_LEN_BYTE / sizeof(float)); \
        d;                                                               \
    })

#define GiFloat32Type2FixLenV2Type(s)                                       \
    __extension__({                                                         \
        GI_FLOAT32_FIXLEN_V2_t d;                                           \
        d.val[0] = GiFloat32Type2FixLenType(GiGetSubVectorFloat32V2(s, 0)); \
        d.val[1] = GiFloat32Type2FixLenType(GiGetSubVectorFloat32V2(s, 1)); \
        d;                                                                  \
    })

#define GiFixLenType2GiFloat32V2Type(s)                                      \
    __extension__({                                                          \
        GI_FLOAT32_V2_t d;                                                   \
        GiSetSubVectorFloat32V2(d, 0, GiFixLenType2GiFloat32Type(s.val[0])); \
        GiSetSubVectorFloat32V2(d, 1, GiFixLenType2GiFloat32Type(s.val[1])); \
        d;                                                                   \
    })

#define GiUint8Type2FixLenType(s)                                         \
    __extension__({                                                       \
        GI_UINT8_FIXLEN_t d;                                              \
        vse8_v_u8m1((uint8_t*)&d, s, GI_SIMD_LEN_BYTE / sizeof(uint8_t)); \
        d;                                                                \
    })

#define GiFixLenType2GiUint8Type(s)                                        \
    __extension__({                                                        \
        GI_UINT8_t d;                                                      \
        d = vle8_v_u8m1((uint8_t*)&s, GI_SIMD_LEN_BYTE / sizeof(uint8_t)); \
        d;                                                                 \
    })

#define GiInt8Type2FixLenType(s)                                        \
    __extension__({                                                     \
        GI_INT8_FIXLEN_t d;                                             \
        vse8_v_i8m1((int8_t*)&d, s, GI_SIMD_LEN_BYTE / sizeof(int8_t)); \
        d;                                                              \
    })

#define GiFixLenType2GiInt8Type(s)                                       \
    __extension__({                                                      \
        GI_INT8_t d;                                                     \
        d = vle8_v_i8m1((int8_t*)&s, GI_SIMD_LEN_BYTE / sizeof(int8_t)); \
        d;                                                               \
    })

#define GiInt16Type2FixLenType(s)                                           \
    __extension__({                                                         \
        GI_INT16_FIXLEN_t d;                                                \
        vse16_v_i16m1((int16_t*)&d, s, GI_SIMD_LEN_BYTE / sizeof(int16_t)); \
        d;                                                                  \
    })

#define GiFixLenType2GiInt16Type(s)                                          \
    __extension__({                                                          \
        GI_INT16_t d;                                                        \
        d = vle16_v_i16m1((int16_t*)&s, GI_SIMD_LEN_BYTE / sizeof(int16_t)); \
        d;                                                                   \
    })

#define GiInt32Type2FixLenType(s)                                           \
    __extension__({                                                         \
        GI_INT32_FIXLEN_t d;                                                \
        vse32_v_i32m1((int32_t*)&d, s, GI_SIMD_LEN_BYTE / sizeof(int32_t)); \
        d;                                                                  \
    })

#define GiFixLenType2GiInt32Type(s)                                          \
    __extension__({                                                          \
        GI_INT32_t d;                                                        \
        d = vle32_v_i32m1((int32_t*)&s, GI_SIMD_LEN_BYTE / sizeof(int32_t)); \
        d;                                                                   \
    })

#define GiUint32Type2FixLenType(s)                                            \
    __extension__({                                                           \
        GI_UINT32_FIXLEN_t d;                                                 \
        vse32_v_u32m1((uint32_t*)&d, s, GI_SIMD_LEN_BYTE / sizeof(uint32_t)); \
        d;                                                                    \
    })

#define GiFixLenType2GiUint32Type(s)                                           \
    __extension__({                                                            \
        GI_UINT32_t d;                                                         \
        d = vle32_v_u32m1((uint32_t*)&s, GI_SIMD_LEN_BYTE / sizeof(uint32_t)); \
        d;                                                                     \
    })
547
#else
548 549 550 551 552 553 554
typedef GI_FLOAT32_t GI_FLOAT32_FIXLEN_t;
typedef GI_FLOAT32_V2_t GI_FLOAT32_FIXLEN_V2_t;
typedef GI_UINT8_t GI_UINT8_FIXLEN_t;
typedef GI_INT8_t GI_INT8_FIXLEN_t;
typedef GI_INT16_t GI_INT16_FIXLEN_t;
typedef GI_INT32_t GI_INT32_FIXLEN_t;
typedef GI_UINT32_t GI_UINT32_FIXLEN_t;
555 556
#define GiFloat32Type2FixLenType(s)   s
#define GiFixLenType2GiFloat32Type(s) s
557

558 559
#define GiFloat32Type2FixLenV2Type(s)   s
#define GiFixLenType2GiFloat32V2Type(s) s
560

561 562
#define GiUint8Type2FixLenType(s)   s
#define GiFixLenType2GiUint8Type(s) s
563

564 565
#define GiInt8Type2FixLenType(s)   s
#define GiFixLenType2GiInt8Type(s) s
566

567 568
#define GiInt16Type2FixLenType(s)   s
#define GiFixLenType2GiInt16Type(s) s
569

570 571
#define GiInt32Type2FixLenType(s)   s
#define GiFixLenType2GiInt32Type(s) s
572

573 574
#define GiUint32Type2FixLenType(s)        s
#define GiFixLenType2GiUint32Type(s)      s
575 576 577 578 579 580 581 582 583 584 585 586

//! get subvector
#define GiGetSubVectorFloat32V2(s, index) s.val[index]
#define GiGetSubVectorFloat32V3(s, index) s.val[index]
#define GiGetSubVectorFloat32V4(s, index) s.val[index]

#define GiGetSubVectorInt32V2(s, index) s.val[index]
#define GiGetSubVectorInt32V4(s, index) s.val[index]

#define GiGetSubVectorInt16V2(s, index) s.val[index]

#define GiGetSubVectorInt8V2(s, index)       s.val[index]
587 588
#define GiGetSubVectorInt8V3(s, index)       s.val[index]
#define GiGetSubVectorInt8V4(s, index)       s.val[index]
589 590 591 592 593 594 595 596 597 598 599 600

//! insert subvector
#define GiSetSubVectorFloat32V2(d, index, s) d.val[index] = s
#define GiSetSubVectorFloat32V3(d, index, s) d.val[index] = s
#define GiSetSubVectorFloat32V4(d, index, s) d.val[index] = s

#define GiSetSubVectorInt32V2(d, index, s) d.val[index] = s
#define GiSetSubVectorInt32V4(d, index, s) d.val[index] = s

#define GiSetSubVectorInt16V2(d, index, s) d.val[index] = s

#define GiSetSubVectorInt8V2(d, index, s) d.val[index] = s
601 602
#endif

603 604 605
#define Max(a, b) (a) > (b) ? (a) : (b)
#define Min(a, b) (a) < (b) ? (a) : (b)

606
#if defined(GI_NEON_INTRINSICS)
607 608 609 610
#if defined(__ARM_FEATURE_FMA) && defined(GI_NEON64_INTRINSICS)
#define v_fma_ps_f32(c, b, a)         vfmaq_f32((c), (b), (a))
#define v_fma_n_f32(c, b, a)          vfmaq_n_f32((c), (b), (a))
#define v_fma_lane_f32(c, b, a, lane) vfmaq_lane_f32((c), (b), (a), (lane))
611
#else
612 613 614 615
#define v_fma_ps_f32(c, b, a)         vmlaq_f32((c), (b), (a))
#define v_fma_n_f32(c, b, a)          vmlaq_n_f32((c), (b), (a))
#define v_fma_lane_f32(c, b, a, lane) vmlaq_lane_f32((c), (b), (a), (lane))
#endif
616 617 618
#endif

GI_FORCEINLINE
619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640
enum GiSimdType GiGetSimdType() {
    //! override by special macro to insure ci have test naive and sse2
    //! now we do not imp GI_AVX to now and x64 ci device will test GI_SSE42
    //! now arm ci device will test GI_NEON
    //! insure test GI_SSE2 by command:
    //! --copt -march=core2 --copt -mno-sse4.2
    //! --copt -mno-sse3 --copt -DGI_TEST_SSE2
    //! insure test GI_NAIVE by command:
    //! --copt -DGI_TEST_SSE2
    //! DNN code at least need sse2 at x86
    //! so we can not test GI_NAIVE by
    //! --copt -march=core2 --copt -mno-sse4.2
    //! --copt -mno-sse3 --copt -mno-sse2
    //! --copt -DGI_TEST_NAIVE
    //! about CMake, can override build flags to CMAKE_CXX_FLAGS/CMAKE_C_FLAGS by
    //! EXTRA_CMAKE_ARGS when use scripts/cmake-build/*.sh
#if defined(GI_TEST_NAIVE)
#undef __gi_simd_type
#define __gi_simd_type GI_NAIVE
#elif defined(GI_TEST_SSE2)
#undef __gi_simd_type
#define __gi_simd_type GI_SSE2
641
#endif
642 643

    return __gi_simd_type;
644 645 646
}

GI_FORCEINLINE
647
GI_FLOAT32_t GiBroadcastFloat32(float Value) {
648
#if defined(GI_NEON_INTRINSICS)
649
    return vdupq_n_f32(Value);
650
#elif defined(GI_SSE2_INTRINSICS)
651 652 653
    return _mm_set1_ps(Value);
#elif defined(GI_RVV_INTRINSICS)
    return vfmv_v_f_f32m1(Value, GI_SIMD_LEN_BYTE / sizeof(float));
654
#else
655 656 657 658 659
    GI_FLOAT32_t ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
        ret[i] = Value;
    }
    return ret;
660 661 662
#endif
}

663
GI_FORCEINLINE
664
GI_INT8_t GiBroadcastInt8(int8_t Value) {
665
#if defined(GI_NEON_INTRINSICS)
666
    return vdupq_n_s8(Value);
667
#elif defined(GI_SSE2_INTRINSICS)
668 669 670
    return _mm_set1_epi8(Value);
#elif defined(GI_RVV_INTRINSICS)
    return vmv_v_x_i8m1(Value, GI_SIMD_LEN_BYTE / sizeof(int8_t));
671
#else
672 673
    GI_INT8_t ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) {
674 675 676 677 678 679 680 681 682 683 684 685
        ret[i] = Value;
    }
    return ret;
#endif
}

GI_FORCEINLINE
GI_INT32_t GiBroadcastInt32(int32_t Value) {
#if defined(GI_NEON_INTRINSICS)
    return vdupq_n_s32(Value);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_set1_epi32(Value);
686 687
#elif defined(GI_RVV_INTRINSICS)
    return vmv_v_x_i32m1(Value, GI_SIMD_LEN_BYTE / sizeof(int32_t));
688 689 690 691 692 693 694 695 696 697
#else
    GI_INT32_t ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) {
        ret[i] = Value;
    }
    return ret;
#endif
}

GI_FORCEINLINE
698
GI_INT32_t GiAndInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) {
699
#if defined(GI_NEON_INTRINSICS)
700
    return vandq_s32(Vector1, Vector2);
701
#elif defined(GI_SSE2_INTRINSICS)
702 703 704
    return _mm_and_si128(Vector1, Vector2);
#elif defined(GI_RVV_INTRINSICS)
    return vand_vv_i32m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(int32_t));
705
#else
706
    return Vector1 & Vector2;
707 708 709
#endif
}

710
GI_FORCEINLINE
711 712 713 714 715 716 717 718 719
GI_INT32_t GiOrInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vorrq_s32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_or_si128(Vector1, Vector2);
#elif defined(GI_RVV_INTRINSICS)
    return vor_vv_i32m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(int32_t));
#else
    return Vector1 | Vector2;
720 721 722
#endif
}

723 724 725 726 727 728 729 730 731 732 733 734 735
GI_FORCEINLINE
GI_INT32_t GiAndNotInt32(GI_INT32_t VectorNot, GI_INT32_t Vector) {
#if defined(GI_NEON_INTRINSICS)
    return vandq_s32(vmvnq_s32(VectorNot), Vector);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_andnot_si128(VectorNot, Vector);
#elif defined(GI_RVV_INTRINSICS)
    GI_INT32_t not_v = vnot_v_i32m1(VectorNot, GI_SIMD_LEN_BYTE / sizeof(int32_t));
    return vand_vv_i32m1(not_v, Vector, GI_SIMD_LEN_BYTE / sizeof(int32_t));
#else
    return (~VectorNot) & Vector;
#endif
}
736

737 738 739 740 741 742 743 744 745 746 747 748
GI_FORCEINLINE
GI_INT32_t GiXorInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return veorq_s32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_xor_si128(Vector1, Vector2);
#elif defined(GI_RVV_INTRINSICS)
    return vxor_vv_i32m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(int32_t));
#else
    return Vector1 ^ Vector2;
#endif
}
749
// vim: syntax=cpp.doxygen