//! gi_common.h: common definitions of GI (general intrinsic)
#pragma once

#include "math.h"
#include "stdint.h"
#include "string.h"

//! intrinsic headers of the build platform
#if defined(_WIN32)
#include <intrin.h>
#include <windows.h>
#else
#if defined(__arm__) || defined(__aarch64__)
#include "src/arm_common/simd_macro/marm_neon.h"
#endif
#if defined(__x86_64__) || defined(__i386__)
#include <cpuid.h>
#include <immintrin.h>
#endif
#endif

#if defined(__riscv_vector)
#include <riscv_vector.h>
#endif

//! GI_TARGET_X86: build targets 32 or 64 bit x86 (GCC/Clang or MSVC macros)
#if defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
#define GI_TARGET_X86
#endif

//! GI_TARGET_ARM: build targets 32 bit arm or aarch64
#if defined(__arm__) || defined(__aarch64__)
#define GI_TARGET_ARM
#endif

//! GI stands for general intrinsic
//! _GI_ALIGN_16: 16 byte alignment specifier
//! GI_DECLSPEC_ALIGN(variable, alignment): declare `variable` with the
//! requested alignment
#ifdef _WIN32
#define _GI_ALIGN_16                           __declspec(align(16))
#define GI_DECLSPEC_ALIGN(variable, alignment) DECLSPEC_ALIGN(alignment) variable
#else
#define _GI_ALIGN_16 __attribute__((aligned(16)))
#define GI_DECLSPEC_ALIGN(variable, alignment) \
    variable __attribute__((aligned(alignment)))
#endif

//! GI_FORCEINLINE: ask the compiler to always inline the function
#if defined(_MSC_VER)
#define GI_FORCEINLINE __forceinline
#else
#define GI_FORCEINLINE __attribute__((always_inline)) inline
#endif

//! GI_INTERNAL_DATA: declaration prefix of GI internal data, hidden
//! visibility when not built by MSVC.
//! `extern "C"` is only valid in C++, so C translation units get a plain
//! `extern`; the C++ expansion is unchanged
#if defined(__cplusplus)
#if defined(_MSC_VER)
#define GI_INTERNAL_DATA extern "C"
#else
#define GI_INTERNAL_DATA extern "C" __attribute((visibility("hidden")))
#endif
#else
#if defined(_MSC_VER)
#define GI_INTERNAL_DATA extern
#else
#define GI_INTERNAL_DATA extern __attribute((visibility("hidden")))
#endif
#endif

//! pick the intrinsic families usable for this build
#if defined(GI_TARGET_ARM) && defined(__ARM_NEON)
#define GI_NEON_INTRINSICS
#if defined(__aarch64__)
//! aarch64 enables both the neon64 and the neon32 code paths
#define GI_NEON64_INTRINSICS
#define GI_NEON32_INTRINSICS
#else
#define GI_NEON32_INTRINSICS
#endif
#elif defined(GI_TARGET_X86)
//! fma/avx2/avx detection is kept commented out, only sse is selected
// #if defined(__FMA__)
// #define GI_FMA_INTRINSICS
// #define GI_AVX2_INTRINSICS
// #define GI_AVX_INTRINSICS
// #elif defined(__AVX2__)
// #define GI_AVX2_INTRINSICS
// #define GI_AVX_INTRINSICS
// #elif defined(__AVX__)
// #define GI_AVX_INTRINSICS
#if defined(__SSE4_2__)
//! sse4.2 builds also enable the sse2 code paths
#define GI_SSE42_INTRINSICS
#define GI_SSE2_INTRINSICS
#elif defined(__SSE2__)
#define GI_SSE2_INTRINSICS
#endif
#endif
#if defined(__riscv_vector)
#define GI_RVV_INTRINSICS
#endif

//! GI_TEST_NAIVE drops every intrinsic family so that only the plain C
//! (naive) implementation is built
#if defined(GI_TEST_NAIVE)
#undef GI_NEON_INTRINSICS
#undef GI_NEON64_INTRINSICS
#undef GI_NEON32_INTRINSICS
#undef GI_FMA_INTRINSICS
#undef GI_AVX2_INTRINSICS
#undef GI_AVX_INTRINSICS
#undef GI_SSE42_INTRINSICS
#undef GI_SSE2_INTRINSICS
#undef GI_RVV_INTRINSICS
#endif

//! GI fp16 is only supported on arm64 neon (with fp16 vector arithmetic)
//! and on rvv
#if (defined(GI_NEON_INTRINSICS) && __ARM_FEATURE_FP16_VECTOR_ARITHMETIC && \
     defined(__aarch64__)) ||                                               \
        defined(GI_RVV_INTRINSICS)
#define GI_SUPPORT_F16
#endif

//! general intrinsic supports dynamic length simd
//! GI_SIMD_LEN is the vector width in bits, GI_SIMD_LEN_BYTE in bytes
//! if avx or avx2 the simd length is 256
#if defined(GI_AVX_INTRINSICS) || defined(GI_AVX2_INTRINSICS) || \
        defined(GI_FMA_INTRINSICS)
#define GI_SIMD_LEN      256
#define GI_SIMD_LEN_BYTE 32
//! if neon and sse the simd length is 128
#elif defined(GI_NEON_INTRINSICS) || defined(GI_SSE2_INTRINSICS) || \
        defined(GI_SSE42_INTRINSICS)
#define GI_SIMD_LEN      128
#define GI_SIMD_LEN_BYTE 16
#elif defined(GI_RVV_INTRINSICS)
//! TODO: make gi algo usable for other GI_SIMD_LEN/GI_SIMD_LEN_BYTE
#define GI_SIMD_LEN      128
#define GI_SIMD_LEN_BYTE 16
#else
//! if no simd hardware support, the simd is implemented by C, default set to
//! 128
#define GI_SIMD_LEN      128
#define GI_SIMD_LEN_BYTE 16
#endif

//! stop execution abnormally through the compiler trap builtin, used on
//! code paths that must never be reached
#define gi_trap() __builtin_trap()

//! simd backend identifiers, returned by GiGetSimdType
//! for ci test now
enum GiSimdType {
    GI_UNKNOWN,
    GI_NAIVE,
    GI_AVX,
    GI_SSE42,
    GI_SSE2,
    GI_NEON,
    GI_RVV,
};

//! scalar fp16 type: float16_t on rvv, __fp16 on neon with fp16 vector
//! arithmetic; not defined for the other backends
#if defined(GI_RVV_INTRINSICS)
typedef float16_t gi_float16_t;
#elif defined(GI_NEON_INTRINSICS) && __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
typedef __fp16 gi_float16_t;
#endif

//! vector types of the selected simd backend
//! avx family: 256 bit vectors
#if defined(GI_AVX_INTRINSICS) || defined(GI_AVX2_INTRINSICS) || \
        defined(GI_FMA_INTRINSICS)
#define __gi_simd_type GI_AVX
typedef __m256 GI_FLOAT32_t;
typedef __m256i GI_UINT8_t;
typedef __m256i GI_INT8_t;
typedef __m256i GI_INT16_t;
typedef __m256i GI_INT32_t;
typedef __m256i GI_UINT32_t;
#elif defined(GI_NEON_INTRINSICS)
153
#define __gi_simd_type GI_NEON
154
typedef float32x4_t GI_FLOAT32_t;
155 156 157 158
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
typedef float16x8_t GI_FLOAT16_t;
typedef float16x8x2_t GI_FLOAT16_V2_t;
#endif
159 160 161 162 163
typedef uint8x16_t GI_UINT8_t;
typedef int8x16_t GI_INT8_t;
typedef int16x8_t GI_INT16_t;
typedef int32x4_t GI_INT32_t;
typedef uint32x4_t GI_UINT32_t;
164
typedef float32x4x2_t GI_FLOAT32_V2_t;
165
typedef float32x4x3_t GI_FLOAT32_V3_t;
166 167 168 169 170
typedef float32x4x4_t GI_FLOAT32_V4_t;
typedef int32x4x2_t GI_INT32_V2_t;
typedef int32x4x4_t GI_INT32_V4_t;
typedef int16x8x2_t GI_INT16_V2_t;
typedef int8x16x2_t GI_INT8_V2_t;
171 172
typedef int8x16x3_t GI_INT8_V3_t;
typedef int8x16x4_t GI_INT8_V4_t;
173
typedef int64x2_t GI_INT64_t;
174
#elif defined(GI_SSE2_INTRINSICS) || defined(GI_SSE42_INTRINSICS)
175 176 177 178 179 180 181 182 183 184

#if defined(GI_SSE42_INTRINSICS)
#define __gi_simd_type GI_SSE42
#elif defined(GI_SSE2_INTRINSICS)
#define __gi_simd_type GI_SSE2
#else
#define __gi_simd_type GI_UNKNOWN
#error "code issue happened!!"
#endif

185 186 187 188 189 190
typedef __m128 GI_FLOAT32_t;
typedef __m128i GI_UINT8_t;
typedef __m128i GI_INT8_t;
typedef __m128i GI_INT16_t;
typedef __m128i GI_INT32_t;
typedef __m128i GI_UINT32_t;
191
typedef __m128i GI_INT64_t;
192
//! helper macros of the sse backend
//! _SWAP_HI_LOW32: 8 bit immediate built from the 2 bit fields 2, 3, 0, 1
//! (presumably a shuffle control swapping the high and low halves)
#define _SWAP_HI_LOW32                    (2 | (3 << 2) | (0 << 4) | (1 << 6))
//! _INSERTPS_NDX: pack a source field (bits 6..7) and a destination field
//! (bits 4..5) into one immediate
#define _INSERTPS_NDX(srcField, dstField) (((srcField) << 6) | ((dstField) << 4))
//! _M64: store the low 64 bit of `inp` into `out`
#define _M64(out, inp)                    _mm_storel_epi64((__m128i*)&(out), inp)
//! _pM128i/_pM128: load 64 bit from `a` into the low half of a vector
#define _pM128i(a)                        _mm_loadl_epi64((__m128i*)&(a))
#define _pM128(a)                         _mm_castsi128_ps(_pM128i(a))
//! _M128i/_M128: reinterpret between float and integer vectors
#define _M128i(a)                         _mm_castps_si128(a)
#define _M128(a)                          _mm_castsi128_ps(a)
//! _M64f: store the low 64 bit of the float vector `inp` into `out`
#if defined(__x86_64__)
#define _M64f(out, inp) out.m64_i64[0] = _mm_cvtsi128_si64(_M128i(inp));
#else
#define _M64f(out, inp) _mm_storel_epi64((__m128i*)&(out), _M128i(inp))
#endif
//! _SSE_SWITCH16: expand to a switch over LANE (0..15) that returns
//! NAME(a b, LANE) with LANE written as a literal in every case, so that a
//! runtime lane can be forwarded to something that needs a constant.
//! `a` and `b` are pasted next to each other, the caller supplies the
//! separating comma inside `b`. any other LANE value hits gi_trap()
#define _SSE_SWITCH16(NAME, a, b, LANE) \
    switch (LANE) {                     \
        case 0:                         \
            return NAME(a b, 0);        \
        case 1:                         \
            return NAME(a b, 1);        \
        case 2:                         \
            return NAME(a b, 2);        \
        case 3:                         \
            return NAME(a b, 3);        \
        case 4:                         \
            return NAME(a b, 4);        \
        case 5:                         \
            return NAME(a b, 5);        \
        case 6:                         \
            return NAME(a b, 6);        \
        case 7:                         \
            return NAME(a b, 7);        \
        case 8:                         \
            return NAME(a b, 8);        \
        case 9:                         \
            return NAME(a b, 9);        \
        case 10:                        \
            return NAME(a b, 10);       \
        case 11:                        \
            return NAME(a b, 11);       \
        case 12:                        \
            return NAME(a b, 12);       \
        case 13:                        \
            return NAME(a b, 13);       \
        case 14:                        \
            return NAME(a b, 14);       \
        case 15:                        \
            return NAME(a b, 15);       \
        default:                        \
            gi_trap();                  \
            return NAME(a b, 0);        \
    }
//! _sse2_mm_alignr_epi8: fallback used when __SSE3__ is not defined, built
//! from a byte shift right of `a` or-ed with a byte shift left of `b`.
//! the C variant only handles imm8 of 0, 4, 8 and 12 and traps otherwise
#if !defined(__SSE3__)
#ifdef __cplusplus
#define _sse2_mm_alignr_epi8(b, a, imm8)                                    \
    __extension__({                                                         \
        __m128i d;                                                          \
        const int imm2 = sizeof(__m128i) - (imm8);                          \
        d = _mm_or_si128(_mm_srli_si128(a, imm8), _mm_slli_si128(b, imm2)); \
        d;                                                                  \
    })
#else
#define _sse2_mm_alignr_epi8(b, a, imm8)                                   \
    __extension__({                                                        \
        __m128i d;                                                         \
        const int imm2 = sizeof(__m128i) - (imm8);                         \
        if (16 == imm2) {                                                  \
            d = _mm_or_si128(_mm_srli_si128(a, 0), _mm_slli_si128(b, 16)); \
        } else if (12 == imm2) {                                           \
            d = _mm_or_si128(_mm_srli_si128(a, 4), _mm_slli_si128(b, 12)); \
        } else if (8 == imm2) {                                            \
            d = _mm_or_si128(_mm_srli_si128(a, 8), _mm_slli_si128(b, 8));  \
        } else if (4 == imm2) {                                            \
            d = _mm_or_si128(_mm_srli_si128(a, 12), _mm_slli_si128(b, 4)); \
        } else {                                                           \
            gi_trap();                                                     \
        }                                                                  \
        d;                                                                 \
    })
#endif
#endif

//! _SSE_COMMA lets a comma travel inside a single macro argument
#define _SSE_COMMA ,
//! alignr with a runtime LANE: _SSE_SWITCH16 dispatches LANE (0..15) to a
//! call with a literal lane, using the sse2 fallback macro when __SSE3__ is
//! not defined and _mm_alignr_epi8 otherwise
GI_FORCEINLINE __m128i _MM_ALIGNR_EPI8(__m128i a, __m128i b, int LANE) {
#if !defined(__SSE3__)
    _SSE_SWITCH16(_sse2_mm_alignr_epi8, a, _SSE_COMMA b, LANE)
#else
    _SSE_SWITCH16(_mm_alignr_epi8, a, _SSE_COMMA b, LANE)
#endif
}
//! neon style scalar type names for the sse backend
typedef float float32_t;
typedef double float64_t;
//! 64 bit value viewable as 1x64, 2x32, 4x16 or 8x8 bit lanes, used as
//! float32x2_t on the sse backend
typedef union __m64_128 {
    uint64_t m64_u64[1];
    int64_t m64_i64[1];
    float64_t m64_d64[1];
    uint32_t m64_u32[2];
    int32_t m64_i32[2];
    float32_t m64_f32[2];
    int16_t m64_i16[4];
    uint16_t m64_u16[4];
    int8_t m64_i8[8];
    uint8_t m64_u8[8];
} __m64_128;
typedef __m64_128 float32x2_t;

//! return64/return64f: store the low 64 bit of `a` into the caller's local
//! `res64` and return it. the expansion is two statements, use it only as
//! the last statement of a function body
#define return64(a) \
    _M64(res64, a); \
    return res64;
#define return64f(a) \
    _M64f(res64, a); \
    return res64;
//! _sse_vextq_s32: alignr of (b, a) by `c` 32 bit lanes, i.e. (c) * 4 bytes
#define _sse_vextq_s32(a, b, c)       _MM_ALIGNR_EPI8(b, a, (c) * 4)
//! _sse_vget_lane_f32: read float lane `lane` of a __m64_128 value
#define _sse_vget_lane_f32(vec, lane) (vec).m64_f32[lane]
#elif defined(GI_RVV_INTRINSICS)
#define __gi_simd_type GI_RVV
typedef vfloat32m1_t GI_FLOAT32_t;
307 308
typedef vfloat16m1_t GI_FLOAT16_t;
typedef vfloat16m1x2_t GI_FLOAT16_V2_t;
309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325
typedef vuint8m1_t GI_UINT8_t;
typedef vint8m1_t GI_INT8_t;
typedef vint16m1_t GI_INT16_t;
typedef vint32m1_t GI_INT32_t;
typedef vuint32m1_t GI_UINT32_t;
//! FIXME: nezha D1 do not support vmv.x.s instruct
//! as a workaround, define GI_INT64_t to naive
typedef int64_t GI_INT64_RVV_WORKAROUND_t
        __attribute__((vector_size(GI_SIMD_LEN_BYTE)));
typedef GI_INT64_RVV_WORKAROUND_t GI_INT64_t;
typedef vfloat32m1x2_t GI_FLOAT32_V2_t;
typedef vfloat32m1x3_t GI_FLOAT32_V3_t;
typedef vfloat32m1x4_t GI_FLOAT32_V4_t;
typedef vint32m1x2_t GI_INT32_V2_t;
typedef vint32m1x4_t GI_INT32_V4_t;
typedef vint16m1x2_t GI_INT16_V2_t;
typedef vint8m1x2_t GI_INT8_V2_t;
326 327
typedef vint8m1x3_t GI_INT8_V3_t;
typedef vint8m1x4_t GI_INT8_V4_t;
328 329 330 331
//! vfloat32mf2_t usable at RVV1.0, now we support 0.7, as
//! a workaround, we use vfloat32m1_t instead
typedef vfloat32m1_t float32x2_t;

332
//! naive: no simd backend, use compiler vector extension types
#else
#define __gi_simd_type GI_NAIVE
typedef float GI_FLOAT32_t __attribute__((vector_size(GI_SIMD_LEN_BYTE)));
typedef uint8_t GI_UINT8_t __attribute__((vector_size(GI_SIMD_LEN_BYTE)));
typedef int8_t GI_INT8_t __attribute__((vector_size(GI_SIMD_LEN_BYTE)));
typedef int16_t GI_INT16_t __attribute__((vector_size(GI_SIMD_LEN_BYTE)));
typedef int32_t GI_INT32_t __attribute__((vector_size(GI_SIMD_LEN_BYTE)));
typedef uint32_t GI_UINT32_t __attribute__((vector_size(GI_SIMD_LEN_BYTE)));
typedef int64_t GI_INT64_t __attribute__((vector_size(GI_SIMD_LEN_BYTE)));
//! skip float32x2_t only for arm builds that have __ARM_NEON defined
#if (!defined(__arm__) && !defined(__aarch64__)) || !defined(__ARM_NEON)
typedef float float32x2_t __attribute__((vector_size(GI_SIMD_LEN_BYTE / 2)));
#endif
typedef float float32_t;
#endif

//! some GI apis do not implement every GiSimdType,
//! for example: GiAbsInt32 does not implement the SSE2 case.
//! in that case *_t is defined as _m128* (may be long long), and indexing
//! such a vector does not follow the same logic as a naive vector,
//! so always-available naive vector types are defined here
typedef float GI_FLOAT32_NAIVE_t __attribute__((vector_size(GI_SIMD_LEN_BYTE)));
typedef uint8_t GI_UINT8_NAIVE_t __attribute__((vector_size(GI_SIMD_LEN_BYTE)));
typedef int8_t GI_INT8_NAIVE_t __attribute__((vector_size(GI_SIMD_LEN_BYTE)));
typedef int16_t GI_INT16_NAIVE_t __attribute__((vector_size(GI_SIMD_LEN_BYTE)));
typedef int32_t GI_INT32_NAIVE_t __attribute__((vector_size(GI_SIMD_LEN_BYTE)));
typedef uint32_t GI_UINT32_NAIVE_t __attribute__((vector_size(GI_SIMD_LEN_BYTE)));
typedef int64_t GI_INT64_NAIVE_t __attribute__((vector_size(GI_SIMD_LEN_BYTE)));
typedef float float32x2_NAIVE_t __attribute__((vector_size(GI_SIMD_LEN_BYTE / 2)));
//! aggregates of 2, 3 or 4 naive vectors, sub vectors are stored in val[]
typedef struct {
    GI_INT32_NAIVE_t val[2];
} GI_INT32_V2_NAIVE_t;

typedef struct {
    GI_INT32_NAIVE_t val[4];
} GI_INT32_V4_NAIVE_t;

typedef struct {
    GI_FLOAT32_NAIVE_t val[2];
} GI_FLOAT32_V2_NAIVE_t;

typedef struct {
    GI_FLOAT32_NAIVE_t val[3];
} GI_FLOAT32_V3_NAIVE_t;

typedef struct {
    GI_FLOAT32_NAIVE_t val[4];
} GI_FLOAT32_V4_NAIVE_t;

typedef struct {
    GI_INT16_NAIVE_t val[2];
} GI_INT16_V2_NAIVE_t;

typedef struct {
    GI_INT8_NAIVE_t val[2];
} GI_INT8_V2_NAIVE_t;

387
#if !defined(GI_NEON_INTRINSICS) && !defined(GI_RVV_INTRINSICS)
388
typedef struct {
389 390
    GI_INT32_t val[2];
} GI_INT32_V2_t;
391 392

typedef struct {
393 394
    GI_INT32_t val[4];
} GI_INT32_V4_t;
395 396

typedef struct {
397 398
    GI_FLOAT32_t val[2];
} GI_FLOAT32_V2_t;
399

400 401 402 403
typedef struct {
    GI_FLOAT32_t val[3];
} GI_FLOAT32_V3_t;

404
typedef struct {
405 406 407 408 409 410 411 412 413 414
    GI_FLOAT32_t val[4];
} GI_FLOAT32_V4_t;

typedef struct {
    GI_INT16_t val[2];
} GI_INT16_V2_t;

typedef struct {
    GI_INT8_t val[2];
} GI_INT8_V2_t;
415

416 417 418 419 420 421 422 423
typedef struct {
    GI_INT8_t val[3];
} GI_INT8_V3_t;

typedef struct {
    GI_INT8_t val[4];
} GI_INT8_V4_t;

424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444
#endif
//! a variable length intrinsic type (for example RVV and SVE) can not be a
//! member of a c++ class, because sizeof can not be evaluated at build stage,
//! so we define the *_FIXLEN_t types to solve this case.
//! some variable length intrinsic types can not do array subscript, for
//! example RVV, so we define the GiGetSubVector_xx macros to solve this
//! case. for a fixed length type they in fact do nothing
#if defined(GI_RVV_INTRINSICS)
typedef GI_FLOAT32_NAIVE_t GI_FLOAT32_FIXLEN_t;
typedef GI_FLOAT32_V2_NAIVE_t GI_FLOAT32_FIXLEN_V2_t;
typedef GI_UINT8_NAIVE_t GI_UINT8_FIXLEN_t;
typedef GI_INT8_NAIVE_t GI_INT8_FIXLEN_t;
typedef GI_INT16_NAIVE_t GI_INT16_FIXLEN_t;
typedef GI_INT32_NAIVE_t GI_INT32_FIXLEN_t;
typedef GI_UINT32_NAIVE_t GI_UINT32_FIXLEN_t;

//! get subvector: read sub vector `index` out of a multi vector value
#define GiGetSubVectorFloat32V2(s, index) vget_f32m1x2_f32m1(s, index)
#define GiGetSubVectorFloat32V3(s, index) vget_f32m1x3_f32m1(s, index)
#define GiGetSubVectorFloat32V4(s, index) vget_f32m1x4_f32m1(s, index)

#define GiGetSubVectorFloat16V2(s, index) vget_f16m1x2_f16m1(s, index)

#define GiGetSubVectorInt32V2(s, index) vget_i32m1x2_i32m1(s, index)
#define GiGetSubVectorInt32V4(s, index) vget_i32m1x4_i32m1(s, index)

#define GiGetSubVectorInt16V2(s, index) vget_i16m1x2_i16m1(s, index)

#define GiGetSubVectorInt8V2(s, index) vget_i8m1x2_i8m1(s, index)
#define GiGetSubVectorInt8V3(s, index) vget_i8m1x3_i8m1(s, index)
#define GiGetSubVectorInt8V4(s, index) vget_i8m1x4_i8m1(s, index)

//! insert subvector: `d` is reassigned with sub vector `index` set to `s`
#define GiSetSubVectorFloat32V2(d, index, s) d = vset_f32m1x2(d, index, s)
#define GiSetSubVectorFloat32V3(d, index, s) d = vset_f32m1x3(d, index, s)
#define GiSetSubVectorFloat32V4(d, index, s) d = vset_f32m1x4(d, index, s)

#define GiSetSubVectorFloat16V2(d, index, s) d = vset_f16m1x2(d, index, s)

#define GiSetSubVectorInt32V2(d, index, s) d = vset_i32m1x2(d, index, s)
#define GiSetSubVectorInt32V4(d, index, s) d = vset_i32m1x4(d, index, s)

#define GiSetSubVectorInt16V2(d, index, s) d = vset_i16m1x2(d, index, s)

#define GiSetSubVectorInt8V2(d, index, s) d = vset_i8m1x2(d, index, s)

//! convert between the rvv vector types and the *_FIXLEN_t types
//! Gi<T>Type2FixLenType(s): store the vector `s` into a fixed length value
//! GiFixLenType2Gi<T>Type(s): load a vector from the fixed length value `s`,
//! `s` must be an lvalue because its address is taken
//! every store/load covers GI_SIMD_LEN_BYTE / sizeof(element) elements
#define GiFloat32Type2FixLenType(s)                                     \
    __extension__({                                                     \
        GI_FLOAT32_FIXLEN_t d;                                          \
        vse32_v_f32m1((float*)&d, s, GI_SIMD_LEN_BYTE / sizeof(float)); \
        d;                                                              \
    })

#define GiFixLenType2GiFloat32Type(s)                                    \
    __extension__({                                                      \
        GI_FLOAT32_t d;                                                  \
        d = vle32_v_f32m1((float*)&s, GI_SIMD_LEN_BYTE / sizeof(float)); \
        d;                                                               \
    })

//! V2 variants convert both sub vectors one by one
#define GiFloat32Type2FixLenV2Type(s)                                       \
    __extension__({                                                         \
        GI_FLOAT32_FIXLEN_V2_t d;                                           \
        d.val[0] = GiFloat32Type2FixLenType(GiGetSubVectorFloat32V2(s, 0)); \
        d.val[1] = GiFloat32Type2FixLenType(GiGetSubVectorFloat32V2(s, 1)); \
        d;                                                                  \
    })

#define GiFixLenType2GiFloat32V2Type(s)                                      \
    __extension__({                                                          \
        GI_FLOAT32_V2_t d;                                                   \
        GiSetSubVectorFloat32V2(d, 0, GiFixLenType2GiFloat32Type(s.val[0])); \
        GiSetSubVectorFloat32V2(d, 1, GiFixLenType2GiFloat32Type(s.val[1])); \
        d;                                                                   \
    })

#define GiUint8Type2FixLenType(s)                                         \
    __extension__({                                                       \
        GI_UINT8_FIXLEN_t d;                                              \
        vse8_v_u8m1((uint8_t*)&d, s, GI_SIMD_LEN_BYTE / sizeof(uint8_t)); \
        d;                                                                \
    })

#define GiFixLenType2GiUint8Type(s)                                        \
    __extension__({                                                        \
        GI_UINT8_t d;                                                      \
        d = vle8_v_u8m1((uint8_t*)&s, GI_SIMD_LEN_BYTE / sizeof(uint8_t)); \
        d;                                                                 \
    })

#define GiInt8Type2FixLenType(s)                                        \
    __extension__({                                                     \
        GI_INT8_FIXLEN_t d;                                             \
        vse8_v_i8m1((int8_t*)&d, s, GI_SIMD_LEN_BYTE / sizeof(int8_t)); \
        d;                                                              \
    })

#define GiFixLenType2GiInt8Type(s)                                       \
    __extension__({                                                      \
        GI_INT8_t d;                                                     \
        d = vle8_v_i8m1((int8_t*)&s, GI_SIMD_LEN_BYTE / sizeof(int8_t)); \
        d;                                                               \
    })

#define GiInt16Type2FixLenType(s)                                           \
    __extension__({                                                         \
        GI_INT16_FIXLEN_t d;                                                \
        vse16_v_i16m1((int16_t*)&d, s, GI_SIMD_LEN_BYTE / sizeof(int16_t)); \
        d;                                                                  \
    })

#define GiFixLenType2GiInt16Type(s)                                          \
    __extension__({                                                          \
        GI_INT16_t d;                                                        \
        d = vle16_v_i16m1((int16_t*)&s, GI_SIMD_LEN_BYTE / sizeof(int16_t)); \
        d;                                                                   \
    })

#define GiInt32Type2FixLenType(s)                                           \
    __extension__({                                                         \
        GI_INT32_FIXLEN_t d;                                                \
        vse32_v_i32m1((int32_t*)&d, s, GI_SIMD_LEN_BYTE / sizeof(int32_t)); \
        d;                                                                  \
    })

#define GiFixLenType2GiInt32Type(s)                                          \
    __extension__({                                                          \
        GI_INT32_t d;                                                        \
        d = vle32_v_i32m1((int32_t*)&s, GI_SIMD_LEN_BYTE / sizeof(int32_t)); \
        d;                                                                   \
    })

#define GiUint32Type2FixLenType(s)                                            \
    __extension__({                                                           \
        GI_UINT32_FIXLEN_t d;                                                 \
        vse32_v_u32m1((uint32_t*)&d, s, GI_SIMD_LEN_BYTE / sizeof(uint32_t)); \
        d;                                                                    \
    })

#define GiFixLenType2GiUint32Type(s)                                           \
    __extension__({                                                            \
        GI_UINT32_t d;                                                         \
        d = vle32_v_u32m1((uint32_t*)&s, GI_SIMD_LEN_BYTE / sizeof(uint32_t)); \
        d;                                                                     \
    })
570
#else
571 572 573 574 575 576 577
typedef GI_FLOAT32_t GI_FLOAT32_FIXLEN_t;
typedef GI_FLOAT32_V2_t GI_FLOAT32_FIXLEN_V2_t;
typedef GI_UINT8_t GI_UINT8_FIXLEN_t;
typedef GI_INT8_t GI_INT8_FIXLEN_t;
typedef GI_INT16_t GI_INT16_FIXLEN_t;
typedef GI_INT32_t GI_INT32_FIXLEN_t;
typedef GI_UINT32_t GI_UINT32_FIXLEN_t;
578 579
#define GiFloat32Type2FixLenType(s)   s
#define GiFixLenType2GiFloat32Type(s) s
580

581 582
#define GiFloat32Type2FixLenV2Type(s)   s
#define GiFixLenType2GiFloat32V2Type(s) s
583

584 585
#define GiUint8Type2FixLenType(s)   s
#define GiFixLenType2GiUint8Type(s) s
586

587 588
#define GiInt8Type2FixLenType(s)   s
#define GiFixLenType2GiInt8Type(s) s
589

590 591
#define GiInt16Type2FixLenType(s)   s
#define GiFixLenType2GiInt16Type(s) s
592

593 594
#define GiInt32Type2FixLenType(s)   s
#define GiFixLenType2GiInt32Type(s) s
595

596 597
#define GiUint32Type2FixLenType(s)        s
#define GiFixLenType2GiUint32Type(s)      s
598 599 600 601 602 603

//! get subvector: multi vector values are structs (or neon xN types), so
//! sub vector `index` is simply the val[index] member.
//! arguments are parenthesized so that expressions such as `*ptr` work
#define GiGetSubVectorFloat32V2(s, index) (s).val[index]
#define GiGetSubVectorFloat32V3(s, index) (s).val[index]
#define GiGetSubVectorFloat32V4(s, index) (s).val[index]

#if defined(GI_NEON_INTRINSICS) && __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#define GiGetSubVectorFloat16V2(s, index) (s).val[index]
#endif

#define GiGetSubVectorInt32V2(s, index) (s).val[index]
#define GiGetSubVectorInt32V4(s, index) (s).val[index]

#define GiGetSubVectorInt16V2(s, index) (s).val[index]

#define GiGetSubVectorInt8V2(s, index)       (s).val[index]
#define GiGetSubVectorInt8V3(s, index)       (s).val[index]
#define GiGetSubVectorInt8V4(s, index)       (s).val[index]

//! insert subvector: assign `s` to sub vector `index` of `d`
#define GiSetSubVectorFloat32V2(d, index, s) (d).val[index] = (s)
#define GiSetSubVectorFloat32V3(d, index, s) (d).val[index] = (s)
#define GiSetSubVectorFloat32V4(d, index, s) (d).val[index] = (s)

#if defined(GI_NEON_INTRINSICS) && __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#define GiSetSubVectorFloat16V2(d, index, s) (d).val[index] = (s)
#endif

#define GiSetSubVectorInt32V2(d, index, s) (d).val[index] = (s)
#define GiSetSubVectorInt32V4(d, index, s) (d).val[index] = (s)

#define GiSetSubVectorInt16V2(d, index, s) (d).val[index] = (s)

#define GiSetSubVectorInt8V2(d, index, s) (d).val[index] = (s)
#endif

//! Max/Min of two values. the whole expansion is parenthesized so that the
//! macros can be used inside larger expressions.
//! note: each argument is evaluated twice, do not pass side effects
#define Max(a, b) ((a) > (b) ? (a) : (b))
#define Min(a, b) ((a) < (b) ? (a) : (b))

//! neon multiply-accumulate helpers, (c, b, a) are forwarded unchanged:
//! vfmaq_* when __ARM_FEATURE_FMA is available on neon64, vmlaq_* otherwise
#if defined(GI_NEON_INTRINSICS)
#if defined(__ARM_FEATURE_FMA) && defined(GI_NEON64_INTRINSICS)
#define v_fma_ps_f32(c, b, a)         vfmaq_f32((c), (b), (a))
#define v_fma_n_f32(c, b, a)          vfmaq_n_f32((c), (b), (a))
#define v_fma_lane_f32(c, b, a, lane) vfmaq_lane_f32((c), (b), (a), (lane))
#else
#define v_fma_ps_f32(c, b, a)         vmlaq_f32((c), (b), (a))
#define v_fma_n_f32(c, b, a)          vmlaq_n_f32((c), (b), (a))
#define v_fma_lane_f32(c, b, a, lane) vmlaq_lane_f32((c), (b), (a), (lane))
#endif
#endif

GI_FORCEINLINE
650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671
enum GiSimdType GiGetSimdType() {
    //! override by special macro to insure ci have test naive and sse2
    //! now we do not imp GI_AVX to now and x64 ci device will test GI_SSE42
    //! now arm ci device will test GI_NEON
    //! insure test GI_SSE2 by command:
    //! --copt -march=core2 --copt -mno-sse4.2
    //! --copt -mno-sse3 --copt -DGI_TEST_SSE2
    //! insure test GI_NAIVE by command:
    //! --copt -DGI_TEST_SSE2
    //! DNN code at least need sse2 at x86
    //! so we can not test GI_NAIVE by
    //! --copt -march=core2 --copt -mno-sse4.2
    //! --copt -mno-sse3 --copt -mno-sse2
    //! --copt -DGI_TEST_NAIVE
    //! about CMake, can override build flags to CMAKE_CXX_FLAGS/CMAKE_C_FLAGS by
    //! EXTRA_CMAKE_ARGS when use scripts/cmake-build/*.sh
#if defined(GI_TEST_NAIVE)
#undef __gi_simd_type
#define __gi_simd_type GI_NAIVE
#elif defined(GI_TEST_SSE2)
#undef __gi_simd_type
#define __gi_simd_type GI_SSE2
672
#endif
673 674

    return __gi_simd_type;
675 676 677
}

GI_FORCEINLINE
678
GI_FLOAT32_t GiBroadcastFloat32(float Value) {
679
#if defined(GI_NEON_INTRINSICS)
680
    return vdupq_n_f32(Value);
681
#elif defined(GI_SSE2_INTRINSICS)
682 683 684
    return _mm_set1_ps(Value);
#elif defined(GI_RVV_INTRINSICS)
    return vfmv_v_f_f32m1(Value, GI_SIMD_LEN_BYTE / sizeof(float));
685
#else
686 687 688 689 690
    GI_FLOAT32_t ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
        ret[i] = Value;
    }
    return ret;
691 692 693
#endif
}

694
GI_FORCEINLINE
695
GI_INT8_t GiBroadcastInt8(int8_t Value) {
696
#if defined(GI_NEON_INTRINSICS)
697
    return vdupq_n_s8(Value);
698
#elif defined(GI_SSE2_INTRINSICS)
699 700 701
    return _mm_set1_epi8(Value);
#elif defined(GI_RVV_INTRINSICS)
    return vmv_v_x_i8m1(Value, GI_SIMD_LEN_BYTE / sizeof(int8_t));
702
#else
703 704
    GI_INT8_t ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) {
705 706 707 708 709 710 711 712 713 714 715 716
        ret[i] = Value;
    }
    return ret;
#endif
}

//! return a vector with every int32 lane set to Value
GI_FORCEINLINE
GI_INT32_t GiBroadcastInt32(int32_t Value) {
#if defined(GI_NEON_INTRINSICS)
    return vdupq_n_s32(Value);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_set1_epi32(Value);
#elif defined(GI_RVV_INTRINSICS)
    return vmv_v_x_i32m1(Value, GI_SIMD_LEN_BYTE / sizeof(int32_t));
#else
    //! naive path: fill the vector lane by lane
    GI_INT32_t ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) {
        ret[i] = Value;
    }
    return ret;
#endif
}

GI_FORCEINLINE
729
GI_INT32_t GiAndInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) {
730
#if defined(GI_NEON_INTRINSICS)
731
    return vandq_s32(Vector1, Vector2);
732
#elif defined(GI_SSE2_INTRINSICS)
733 734 735
    return _mm_and_si128(Vector1, Vector2);
#elif defined(GI_RVV_INTRINSICS)
    return vand_vv_i32m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(int32_t));
736
#else
737
    return Vector1 & Vector2;
738 739 740
#endif
}

741
GI_FORCEINLINE
742 743 744 745 746 747 748 749 750
GI_INT32_t GiOrInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vorrq_s32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_or_si128(Vector1, Vector2);
#elif defined(GI_RVV_INTRINSICS)
    return vor_vv_i32m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(int32_t));
#else
    return Vector1 | Vector2;
751 752 753
#endif
}

754 755 756 757 758 759 760 761 762 763 764 765 766
GI_FORCEINLINE
GI_INT32_t GiAndNotInt32(GI_INT32_t VectorNot, GI_INT32_t Vector) {
#if defined(GI_NEON_INTRINSICS)
    return vandq_s32(vmvnq_s32(VectorNot), Vector);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_andnot_si128(VectorNot, Vector);
#elif defined(GI_RVV_INTRINSICS)
    GI_INT32_t not_v = vnot_v_i32m1(VectorNot, GI_SIMD_LEN_BYTE / sizeof(int32_t));
    return vand_vv_i32m1(not_v, Vector, GI_SIMD_LEN_BYTE / sizeof(int32_t));
#else
    return (~VectorNot) & Vector;
#endif
}
767

768 769 770 771 772 773 774 775 776 777 778 779
GI_FORCEINLINE
GI_INT32_t GiXorInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return veorq_s32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_xor_si128(Vector1, Vector2);
#elif defined(GI_RVV_INTRINSICS)
    return vxor_vv_i32m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(int32_t));
#else
    return Vector1 ^ Vector2;
#endif
}
780
// vim: syntax=cpp.doxygen