/**
 * \file dnn/src/fallback/general_intrinsic/gi_int.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2022 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */

#pragma once

#include "gi_common.h"

GI_FORCEINLINE
GI_UINT32_t GiBroadcastUint32(int32_t Value) {
#if defined(GI_NEON_INTRINSICS)
    return vdupq_n_u32(Value);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_set1_epi32(Value);
#else
    GI_UINT32_t ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) {
        ret[i] = Value;
    }
    return ret;
#endif
}

GI_FORCEINLINE
GI_INT32_t GiLoadInt32(const void* Buffer) {
#if defined(GI_NEON_INTRINSICS)
    return vld1q_s32((int32_t*)Buffer);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_loadu_si128((const __m128i*)Buffer);
#else
    GI_INT32_t ret;
    const int32_t* ptr = (int32_t*)Buffer;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) {
        ret[i] = ptr[i];
    }
    return ret;
#endif
}

GI_FORCEINLINE
GI_INT16_t GiLoadInt16(const void* Buffer) {
#if defined(GI_NEON_INTRINSICS)
    return vld1q_s16((int16_t*)Buffer);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_loadu_si128((const __m128i*)Buffer);
#else
    GI_INT16_t ret;
    const int16_t* ptr = (int16_t*)Buffer;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int16_t); i++) {
        ret[i] = ptr[i];
    }
    return ret;
#endif
}

GI_FORCEINLINE
GI_INT8_t GiLoadInt8(const void* Buffer) {
#if defined(GI_NEON_INTRINSICS)
    return vld1q_s8((int8_t*)Buffer);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_loadu_si128((const __m128i*)Buffer);
#else
    GI_INT8_t ret;
    const int8_t* ptr = (int8_t*)Buffer;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) {
        ret[i] = ptr[i];
    }
    return ret;
#endif
}

GI_FORCEINLINE
void GiStoreInt32(void* Buffer, GI_INT32_t Vector) {
#if defined(GI_NEON_INTRINSICS)
    vst1q_s32((int32_t*)Buffer, Vector);
#elif defined(GI_SSE2_INTRINSICS)
    _mm_storeu_si128((__m128i*)Buffer, Vector);
#else
    int32_t* ptr = (int32_t*)Buffer;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) {
        ptr[i] = Vector[i];
    }
#endif
}

#if defined(GI_NEON_INTRINSICS)
#define GISTORELANEINT32(i)                                                      \
    GI_FORCEINLINE void GiStoreLane##i##Int32(void* Buffer, GI_INT32_t Vector) { \
        vst1q_lane_s32((int32_t*)Buffer, Vector, i);                             \
    }

#elif defined(GI_SSE2_INTRINSICS)

#define GISTORELANEINT32(i)                                                         \
    GI_FORCEINLINE void GiStoreLane##i##Int32(void* Buffer, GI_INT32_t Vector) {    \
        GI_FLOAT32_t tmp = _mm_castsi128_ps(Vector);                                \
        _mm_store_ss(                                                               \
                (float*)Buffer, _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(i, i, i, i))); \
    }
#else
#define GISTORELANEINT32(i)                                                      \
    GI_FORCEINLINE void GiStoreLane##i##Int32(void* Buffer, GI_INT32_t Vector) { \
        *((int32_t*)Buffer) = Vector[i];                                         \
    }
#endif

GISTORELANEINT32(0)
GISTORELANEINT32(1)
GISTORELANEINT32(2)
GISTORELANEINT32(3)
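//! The instantiations above define GiStoreLane0Int32 .. GiStoreLane3Int32, each of
//! which stores the corresponding 32-bit lane of Vector to Buffer.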

#undef GISTORELANEINT32

GI_FORCEINLINE
GI_INT8_t GiReinterInt32ToInt8(GI_INT32_t Vector) {
#if defined(GI_NEON_INTRINSICS)
    return vreinterpretq_s8_s32(Vector);
#elif defined(GI_SSE2_INTRINSICS)
    return Vector;
#else
    return *(GI_INT8_t*)&Vector;
#endif
}

GI_FORCEINLINE
void GiStoreInt16(void* Buffer, GI_INT16_t Vector) {
#if defined(GI_NEON_INTRINSICS)
    vst1q_s16((int16_t*)Buffer, Vector);
#elif defined(GI_SSE2_INTRINSICS)
    _mm_storeu_si128((__m128i*)Buffer, Vector);
#else
    int16_t* ptr = (int16_t*)Buffer;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int16_t); i++) {
        ptr[i] = Vector[i];
    }
#endif
}

GI_FORCEINLINE
void GiStoreInt8(void* Buffer, GI_INT8_t Vector) {
#if defined(GI_NEON_INTRINSICS)
    vst1q_s8((int8_t*)Buffer, Vector);
#elif defined(GI_SSE2_INTRINSICS)
    _mm_storeu_si128((__m128i*)Buffer, Vector);
#else
    int8_t* ptr = (int8_t*)Buffer;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) {
        ptr[i] = Vector[i];
    }
#endif
}

GI_FORCEINLINE
void GiStoreLowInt8(void* Buffer, GI_INT8_t Vector) {
#if defined(GI_NEON_INTRINSICS)
    vst1_s8((int8_t*)Buffer, vget_low_s8(Vector));
#elif defined(GI_SSE2_INTRINSICS)
    _mm_storel_epi64((__m128i*)Buffer, Vector);
#else
    int8_t* ptr = (int8_t*)Buffer;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(int8_t); i++) {
        ptr[i] = Vector[i];
    }
#endif
}

GI_FORCEINLINE
void GiStoreHihgInt8(void* Buffer, GI_INT8_t Vector) {
#if defined(GI_NEON_INTRINSICS)
    vst1_s8((int8_t*)Buffer, vget_high_s8(Vector));
#elif defined(GI_SSE2_INTRINSICS)
    _mm_storel_epi64((__m128i*)Buffer, _mm_unpackhi_epi64(Vector, Vector));
#else
    int8_t* ptr = (int8_t*)Buffer;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(int8_t); i++) {
        ptr[i] = Vector[GI_SIMD_LEN_BYTE / 2 + i];
    }
#endif
}

GI_FORCEINLINE
GI_INT32_t GiNegInt32(GI_INT32_t Vector) {
#if defined(GI_NEON32_INTRINSICS)
    return vnegq_s32(Vector);
#elif defined(GI_SSE2_INTRINSICS)
    GI_INT32_t zero = _mm_set1_epi32(0);
    return _mm_sub_epi32(zero, Vector);
#else
    return -Vector;
#endif
}

GI_FORCEINLINE
GI_INT8_t GiNegInt8(GI_INT8_t Vector) {
#if defined(GI_NEON32_INTRINSICS)
    return vnegq_s8(Vector);
#elif defined(GI_SSE2_INTRINSICS)
    GI_INT8_t zero = _mm_set1_epi8(0);
    return _mm_sub_epi8(zero, Vector);
#else
    return -Vector;
#endif
}

GI_FORCEINLINE
GI_UINT32_t GiTestAndSetUint32(GI_UINT32_t Vector1, GI_UINT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vtstq_u32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    GI_UINT32_t tmp = _mm_and_si128(Vector1, Vector2);
    //! match vtstq_u32: a lane becomes all-ones where (Vector1 & Vector2) != 0
    return _mm_xor_si128(
            _mm_cmpeq_epi32(tmp, _mm_setzero_si128()), _mm_set1_epi32(-1));
#else
    GI_UINT32_t ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) {
        ret[i] = Vector1[i] & Vector2[i] ? 0xFFFFFFFF : 0;
    }
    return ret;
#endif
}

GI_FORCEINLINE
GI_INT32_t GiAddInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vaddq_s32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_add_epi32(Vector1, Vector2);
#else
    return Vector1 + Vector2;
#endif
}

GI_FORCEINLINE
GI_UINT32_t GiAddUint32(GI_UINT32_t Vector1, GI_UINT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vaddq_u32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_add_epi32(Vector1, Vector2);
#else
    return Vector1 + Vector2;
#endif
}

GI_FORCEINLINE
GI_INT16_t GiAddInt16(GI_INT16_t Vector1, GI_INT16_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vaddq_s16(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_add_epi16(Vector1, Vector2);
#else
    return Vector1 + Vector2;
#endif
}

GI_FORCEINLINE
GI_INT8_t GiAddInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vaddq_s8(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_add_epi8(Vector1, Vector2);
#else
    return Vector1 + Vector2;
#endif
}

GI_FORCEINLINE
GI_INT32_t GiSubtractInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vsubq_s32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_sub_epi32(Vector1, Vector2);
#else
    return Vector1 - Vector2;
#endif
}

GI_FORCEINLINE
GI_UINT32_t GiSubtractUint32(GI_UINT32_t Vector1, GI_UINT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vsubq_u32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_sub_epi32(Vector1, Vector2);
#else
    return Vector1 - Vector2;
#endif
}

GI_FORCEINLINE
GI_INT8_t GiSubtractInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vsubq_s8(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_sub_epi8(Vector1, Vector2);
#else
    return Vector1 - Vector2;
#endif
}

GI_FORCEINLINE
GI_INT32_t GiMultiplyInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vmulq_s32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    GI_FLOAT32_t v0 = _mm_cvtepi32_ps(Vector1);
    GI_FLOAT32_t v1 = _mm_cvtepi32_ps(Vector2);
    return _mm_cvttps_epi32(_mm_mul_ps(v0, v1));
#else
    return Vector1 * Vector2;
#endif
}
//! x86 SSE2 has no packed int8 multiply (and no full int32 multiply before
//! SSE4.1), so the SSE2 paths here are implemented naively; note that the int32
//! path above goes through float and can lose precision for large magnitudes.
GI_FORCEINLINE
GI_INT8_t GiMultiplyInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vmulq_s8(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    int8_t v1[16], v2[16], res[16];
    _mm_storeu_si128((__m128i*)v1, Vector1);
    _mm_storeu_si128((__m128i*)v2, Vector2);
    for (size_t id = 0; id < 16; id++) {
        res[id] = v1[id] * v2[id];
    }
    return _mm_loadu_si128((__m128i*)res);
#else
    return Vector1 * Vector2;
#endif
}

GI_FORCEINLINE
GI_INT32_t GiMultiplyAddInt32(
        GI_INT32_t Vector1, GI_INT32_t Vector2, GI_INT32_t Vector3) {
#if defined(GI_NEON_INTRINSICS)
    return vmlaq_s32(Vector1, Vector2, Vector3);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_add_epi32(Vector1, GiMultiplyInt32(Vector2, Vector3));
#else
    return Vector1 + Vector2 * Vector3;
#endif
}

GI_FORCEINLINE
GI_INT8_t GiMultiplyAddInt8(GI_INT8_t Vector1, GI_INT8_t Vector2, GI_INT8_t Vector3) {
#if defined(GI_NEON_INTRINSICS)
    return vmlaq_s8(Vector1, Vector2, Vector3);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_add_epi8(Vector1, GiMultiplyInt8(Vector2, Vector3));
#else
    return Vector1 + Vector2 * Vector3;
#endif
}

GI_FORCEINLINE
GI_INT8_t GiAndInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vandq_s8(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_and_si128(Vector1, Vector2);
#else
    return Vector1 & Vector2;
#endif
}

GI_FORCEINLINE
GI_UINT32_t GiEOrUint32(GI_UINT32_t Vector1, GI_UINT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return veorq_u32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_xor_si128(Vector1, Vector2);
#else
    return Vector1 ^ Vector2;
#endif
}

GI_FORCEINLINE
GI_INT8_t GiOrInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vorrq_s8(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_or_si128(Vector1, Vector2);
#else
    return Vector1 | Vector2;
#endif
}

GI_FORCEINLINE
GI_INT8_t GiAndNotInt8(GI_INT8_t VectorNot, GI_INT8_t Vector) {
#if defined(GI_NEON_INTRINSICS)
    return vandq_s8(vmvnq_s8(VectorNot), Vector);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_andnot_si128(VectorNot, Vector);
#else
    GI_INT8_t Not = ~VectorNot;
    return (Not & Vector);
#endif
}

GI_FORCEINLINE
GI_INT8_t GiXorInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return veorq_s8(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_xor_si128(Vector1, Vector2);
#else
    return Vector1 ^ Vector2;
#endif
}

GI_FORCEINLINE
GI_INT32_t GiShiftLeft23Int32(GI_INT32_t Vector) {
#if defined(GI_NEON_INTRINSICS)
    return vshlq_n_s32(Vector, 23);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_slli_epi32(Vector, 23);
#else
    return Vector << 23;
#endif
}

GI_FORCEINLINE
GI_INT32_t GiShiftRight23Int32(GI_INT32_t Vector) {
#if defined(GI_NEON_INTRINSICS)
    return vshrq_n_s32(Vector, 23);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_srai_epi32(Vector, 23);
#else
    return Vector >> 23;
#endif
}
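
//! GiBlendInt32 below relies on GiAndInt32/GiOrInt32/GiAndNotInt32, which are not
//! defined in this file. In case they are not provided elsewhere (e.g. by
//! gi_common.h), the following minimal sketches, modeled on the Int8 variants
//! above, are assumed rather than taken from upstream.
GI_FORCEINLINE
GI_INT32_t GiAndInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vandq_s32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_and_si128(Vector1, Vector2);
#else
    return Vector1 & Vector2;
#endif
}

GI_FORCEINLINE
GI_INT32_t GiOrInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vorrq_s32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_or_si128(Vector1, Vector2);
#else
    return Vector1 | Vector2;
#endif
}

//! computes (~VectorNot) & Vector, matching the argument order of GiAndNotInt8
GI_FORCEINLINE
GI_INT32_t GiAndNotInt32(GI_INT32_t VectorNot, GI_INT32_t Vector) {
#if defined(GI_NEON_INTRINSICS)
    return vandq_s32(vmvnq_s32(VectorNot), Vector);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_andnot_si128(VectorNot, Vector);
#else
    return (~VectorNot) & Vector;
#endif
}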

GI_FORCEINLINE
GI_INT32_t GiBlendInt32(GI_INT32_t Vector1, GI_INT32_t Vector2, GI_INT32_t Selection) {
    return GiOrInt32(GiAndInt32(Vector2, Selection), GiAndNotInt32(Selection, Vector1));
}
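//! Note (illustrative, not upstream documentation): the blend is a bit-wise
//! select, so a Selection lane of all ones (0xFFFFFFFF) picks the Vector2 lane,
//! a lane of 0 picks the Vector1 lane, and partial masks mix the two bit by bit.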

GI_FORCEINLINE
GI_INT8_t GiBlendInt8(GI_INT8_t Vector1, GI_INT8_t Vector2, GI_INT8_t Selection) {
    return GiOrInt8(GiAndInt8(Vector2, Selection), GiAndNotInt8(Selection, Vector1));
}

GI_FORCEINLINE
GI_INT32_t GiAbsInt32(GI_INT32_t Vector) {
#if defined(GI_NEON_INTRINSICS)
    return vabsq_s32(Vector);
#elif defined(GI_SSE42_INTRINSICS)
    return _mm_abs_epi32(Vector);
#else
    GI_INT32_t ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) {
        ret[i] = Vector[i] > 0 ? Vector[i] : -Vector[i];
    }
    return ret;
#endif
}

GI_FORCEINLINE
GI_INT16_t GiAbsInt16(GI_INT16_t Vector) {
#if defined(GI_NEON_INTRINSICS)
    return vabsq_s16(Vector);
#elif defined(GI_SSE42_INTRINSICS)
    return _mm_abs_epi16(Vector);
#else
    GI_INT16_t ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int16_t); i++) {
        ret[i] = Vector[i] > 0 ? Vector[i] : -Vector[i];
    }
    return ret;
#endif
}

GI_FORCEINLINE
GI_INT8_t GiAbsInt8(GI_INT8_t Vector) {
#if defined(GI_NEON_INTRINSICS)
    return vabsq_s8(Vector);
#elif defined(GI_SSE42_INTRINSICS)
    return _mm_abs_epi8(Vector);
#else
    GI_INT8_t ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) {
        ret[i] = Vector[i] > 0 ? Vector[i] : -Vector[i];
    }
    return ret;
#endif
}

GI_FORCEINLINE
GI_INT32_t GiMaximumInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vmaxq_s32(Vector1, Vector2);
#elif defined(GI_SSE42_INTRINSICS)
    return _mm_max_epi32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return GiBlendInt32(Vector2, Vector1, _mm_cmpgt_epi32(Vector1, Vector2));
#else
    return GiBlendInt32(Vector2, Vector1, Vector1 > Vector2);
#endif
}

GI_FORCEINLINE
GI_INT32_t GiMinimumInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vminq_s32(Vector1, Vector2);
#elif defined(GI_SSE42_INTRINSICS)
    return _mm_min_epi32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return GiBlendInt32(Vector2, Vector1, _mm_cmpgt_epi32(Vector2, Vector1));
#else
    return GiBlendInt32(Vector2, Vector1, Vector2 > Vector1);
#endif
}

GI_FORCEINLINE
GI_INT8_t GiBlendInt8x16(GI_INT8_t Vector1, GI_INT8_t Vector2, GI_INT8_t Selection) {
    return GiOrInt8(GiAndInt8(Vector2, Selection), GiAndNotInt8(Selection, Vector1));
}

GI_FORCEINLINE
GI_INT8_t GiMaximumInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vmaxq_s8(Vector1, Vector2);
#elif defined(GI_SSE42_INTRINSICS)
    return _mm_max_epi8(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return GiBlendInt8(Vector2, Vector1, _mm_cmpgt_epi8(Vector1, Vector2));
#else
    return GiBlendInt8(Vector2, Vector1, Vector1 > Vector2);
#endif
}

GI_FORCEINLINE
GI_INT8_t GiMinimumInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vminq_s8(Vector1, Vector2);
#elif defined(GI_SSE42_INTRINSICS)
    return _mm_min_epi8(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return GiBlendInt8(Vector2, Vector1, _mm_cmpgt_epi8(Vector2, Vector1));
#else
    return GiBlendInt8(Vector2, Vector1, Vector2 > Vector1);
#endif
}

GI_FORCEINLINE
GI_INT16_t GiMoveHighLongInt8(GI_INT8_t Vector) {
#if defined(GI_NEON_INTRINSICS)
    return vmovl_s8(vget_high_s8(Vector));
#elif defined(GI_SSE42_INTRINSICS)
    return _mm_cvtepi8_epi16(_mm_unpackhi_epi64(Vector, Vector));
#elif defined(GI_SSE2_INTRINSICS)
    int16_t data[8];
    int8_t o_data[16];
    _mm_storeu_si128((__m128i*)o_data, Vector);
    for (int i = 0; i < 8; i++) {
        data[i] = o_data[8 + i];
    }
    return _mm_loadu_si128((__m128i*)data);
#else
    GI_INT16_t ret;
    int8_t* data = (int8_t*)&Vector;
    size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int8_t);
    for (size_t i = 0; i < half_length; i++) {
        ret[i] = data[i + half_length];
    }
    return ret;
#endif
}

GI_FORCEINLINE
GI_INT16_t GiMoveLowLongInt8(GI_INT8_t Vector) {
#if defined(GI_NEON_INTRINSICS)
    return vmovl_s8(vget_low_s8(Vector));
#elif defined(GI_SSE42_INTRINSICS)
    return _mm_cvtepi8_epi16(Vector);
#elif defined(GI_SSE2_INTRINSICS)
    int16_t data[8];
    int8_t o_data[16];
    _mm_storeu_si128((__m128i*)o_data, Vector);
    for (int i = 0; i < 8; i++) {
        data[i] = o_data[i];
    }
    return _mm_loadu_si128((__m128i*)data);
#else
    GI_INT16_t ret;
    size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int8_t);
    for (size_t i = 0; i < half_length; i++) {
        ret[i] = Vector[i];
    }
    return ret;
#endif
}

GI_FORCEINLINE
GI_INT32_t GiMoveHighLongInt16(GI_INT16_t Vector) {
#if defined(GI_NEON_INTRINSICS)
    return vmovl_s16(vget_high_s16(Vector));
#elif defined(GI_SSE42_INTRINSICS)
    return _mm_cvtepi16_epi32(_mm_unpackhi_epi64(Vector, Vector));
#elif defined(GI_SSE2_INTRINSICS)
    int32_t data[4];
    int16_t o_data[8];
    _mm_storeu_si128((__m128i*)o_data, Vector);
    for (int i = 0; i < 4; i++) {
        data[i] = o_data[4 + i];
    }
    return _mm_loadu_si128((__m128i*)data);
#else
    GI_INT32_t ret;
    size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int16_t);
    for (size_t i = 0; i < half_length; i++) {
        ret[i] = Vector[half_length + i];
    }
    return ret;
#endif
}

GI_FORCEINLINE
GI_INT32_t GiMoveLowLongInt16(GI_INT16_t Vector) {
#if defined(GI_NEON_INTRINSICS)
    return vmovl_s16(vget_low_s16(Vector));
#elif defined(GI_SSE42_INTRINSICS)
    return _mm_cvtepi16_epi32(Vector);
#elif defined(GI_SSE2_INTRINSICS)
    int32_t data[4];
    int16_t o_data[8];
    _mm_storeu_si128((__m128i*)o_data, Vector);
    for (int i = 0; i < 4; i++) {
        data[i] = o_data[i];
    }
    return _mm_loadu_si128((__m128i*)data);
#else
    GI_INT32_t ret;
    size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int16_t);
    for (size_t i = 0; i < half_length; i++) {
        ret[i] = Vector[i];
    }
    return ret;
#endif
}

GI_FORCEINLINE
int32_t GiReduceAddInt8(GI_INT8_t Vector) {
#if defined(GI_NEON64_INTRINSICS)
    return vaddlvq_s8(Vector);
#elif defined(GI_NEON32_INTRINSICS)
    int32x4_t sum = vpaddlq_s16(vpaddlq_s8(Vector));
    return (vgetq_lane_s32(sum, 0) + vgetq_lane_s32(sum, 1) + vgetq_lane_s32(sum, 2) +
            vgetq_lane_s32(sum, 3));
#elif defined(GI_SSE42_INTRINSICS)
    __m128i v0 = _mm_cvtepi8_epi16(Vector);
    __m128i v1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(Vector, Vector));
    __m128i sum_int16 = _mm_add_epi16(v0, v1);
    __m128i v0_int32 = _mm_cvtepi16_epi32(sum_int16);
    __m128i v1_int32 = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(sum_int16, sum_int16));
    __m128i sum = _mm_add_epi32(v0_int32, v1_int32);
    int32_t ret = _mm_extract_epi32(sum, 0);
    ret += _mm_extract_epi32(sum, 1);
    ret += _mm_extract_epi32(sum, 2);
    ret += _mm_extract_epi32(sum, 3);
    return ret;
#elif defined(GI_SSE2_INTRINSICS)
    __m64 low = _mm_movepi64_pi64(Vector);
    __m64 high = _mm_movepi64_pi64(_mm_unpackhi_epi64(Vector, Vector));
    __m128 v0 = _mm_cvtpi8_ps(low);
    __m128 v1 = _mm_cvtpi8_ps(_mm_unpackhi_pi32(low, low));
    __m128 v2 = _mm_cvtpi8_ps(high);
    __m128 v3 = _mm_cvtpi8_ps(_mm_unpackhi_pi32(high, high));
    __m128 sum0 = _mm_add_ps(v0, v1);
    __m128 sum1 = _mm_add_ps(v2, v3);
    __m128 sum = _mm_add_ps(sum0, sum1);
    float ret0 = _mm_cvtss_f32(sum);
    float ret1 = _mm_cvtss_f32(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 1, 1, 1)));
    float ret2 = _mm_cvtss_f32(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(2, 2, 2, 2)));
    float ret3 = _mm_cvtss_f32(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(3, 3, 3, 3)));
    return (int32_t)(ret0 + ret1 + ret2 + ret3);
#else
    int32_t sum = 0;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) {
        sum += Vector[i];
    }
    return sum;
#endif
}

GI_FORCEINLINE
int8_t GiReduceMaxInt8(GI_INT8_t Vector) {
#if defined(GI_NEON64_INTRINSICS)
    return vmaxvq_s8(Vector);
#elif defined(GI_NEON32_INTRINSICS)
    int8x8_t VectorLow = vget_low_s8(Vector);
    int8x8_t VectorHigh = vget_high_s8(Vector);
    VectorLow = vpmax_s8(VectorLow, VectorHigh);
    VectorLow = vpmax_s8(VectorLow, VectorLow);
    VectorLow = vpmax_s8(VectorLow, VectorLow);
    VectorLow = vpmax_s8(VectorLow, VectorLow);
    return vget_lane_s8(VectorLow, 0);
#elif defined(GI_SSE42_INTRINSICS)
    __m128i v0 = _mm_cvtepi8_epi16(Vector);
    __m128i v1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(Vector, Vector));
    __m128i max_int16 = _mm_max_epi16(v0, v1);
    __m128i v0_int32 = _mm_cvtepi16_epi32(max_int16);
    __m128i v1_int32 = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(max_int16, max_int16));
    __m128i sum = _mm_max_epi32(v0_int32, v1_int32);
    int ret = _mm_extract_epi32(sum, 0);
    ret = Max(_mm_extract_epi32(sum, 1), ret);
    ret = Max(_mm_extract_epi32(sum, 2), ret);
    ret = Max(_mm_extract_epi32(sum, 3), ret);
    return (int8_t)ret;
#elif defined(GI_SSE2_INTRINSICS)
    __m64 low = _mm_movepi64_pi64(Vector);
    __m64 high = _mm_movepi64_pi64(_mm_unpackhi_epi64(Vector, Vector));
    __m128 v0 = _mm_cvtpi8_ps(low);
    __m128 v1 = _mm_cvtpi8_ps(_mm_unpackhi_pi32(low, low));
    __m128 v2 = _mm_cvtpi8_ps(high);
    __m128 v3 = _mm_cvtpi8_ps(_mm_unpackhi_pi32(high, high));
    __m128 max0 = _mm_max_ps(v0, v1);
    __m128 max1 = _mm_max_ps(v2, v3);
    __m128 max = _mm_max_ps(max0, max1);
    float ret0 = _mm_cvtss_f32(max);
    float ret1 = _mm_cvtss_f32(_mm_shuffle_ps(max, max, _MM_SHUFFLE(1, 1, 1, 1)));
    float ret2 = _mm_cvtss_f32(_mm_shuffle_ps(max, max, _MM_SHUFFLE(2, 2, 2, 2)));
    float ret3 = _mm_cvtss_f32(_mm_shuffle_ps(max, max, _MM_SHUFFLE(3, 3, 3, 3)));
    return (int8_t)(Max(Max(ret0, ret1), Max(ret2, ret3)));
#else
    int8_t max = Vector[0];
    for (size_t i = 1; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) {
        max = Max(max, Vector[i]);
    }
    return max;
#endif
}

GI_FORCEINLINE
int8_t GiReduceMinInt8(GI_INT8_t Vector) {
#if defined(GI_NEON64_INTRINSICS)
    return vminvq_s8(Vector);
#elif defined(GI_NEON32_INTRINSICS)
    int8x8_t VectorLow = vget_low_s8(Vector);
    int8x8_t VectorHigh = vget_high_s8(Vector);
    VectorLow = vpmin_s8(VectorLow, VectorHigh);
    VectorLow = vpmin_s8(VectorLow, VectorLow);
    VectorLow = vpmin_s8(VectorLow, VectorLow);
    VectorLow = vpmin_s8(VectorLow, VectorLow);
    return vget_lane_s8(VectorLow, 0);
#elif defined(GI_SSE42_INTRINSICS)
    __m128i v0 = _mm_cvtepi8_epi16(Vector);
    __m128i v1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(Vector, Vector));
    __m128i min_int16 = _mm_min_epi16(v0, v1);
    __m128i v0_int32 = _mm_cvtepi16_epi32(min_int16);
    __m128i v1_int32 = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(min_int16, min_int16));
    __m128i sum = _mm_min_epi32(v0_int32, v1_int32);
    int ret = _mm_extract_epi32(sum, 0);
    ret = Min(_mm_extract_epi32(sum, 1), ret);
    ret = Min(_mm_extract_epi32(sum, 2), ret);
    ret = Min(_mm_extract_epi32(sum, 3), ret);
    return (int8_t)ret;
#elif defined(GI_SSE2_INTRINSICS)
    __m64 low = _mm_movepi64_pi64(Vector);
    __m64 high = _mm_movepi64_pi64(_mm_unpackhi_epi64(Vector, Vector));
    __m128 v0 = _mm_cvtpi8_ps(low);
    __m128 v1 = _mm_cvtpi8_ps(_mm_unpackhi_pi32(low, low));
    __m128 v2 = _mm_cvtpi8_ps(high);
    __m128 v3 = _mm_cvtpi8_ps(_mm_unpackhi_pi32(high, high));
    __m128 min0 = _mm_min_ps(v0, v1);
    __m128 min1 = _mm_min_ps(v2, v3);
    __m128 min = _mm_min_ps(min0, min1);
    float ret0 = _mm_cvtss_f32(min);
    float ret1 = _mm_cvtss_f32(_mm_shuffle_ps(min, min, _MM_SHUFFLE(1, 1, 1, 1)));
    float ret2 = _mm_cvtss_f32(_mm_shuffle_ps(min, min, _MM_SHUFFLE(2, 2, 2, 2)));
    float ret3 = _mm_cvtss_f32(_mm_shuffle_ps(min, min, _MM_SHUFFLE(3, 3, 3, 3)));
    return (int8_t)(Min(Min(ret0, ret1), Min(ret2, ret3)));
#else
    int8_t min = Vector[0];
    for (size_t i = 1; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) {
        min = Min(min, Vector[i]);
    }
    return min;
#endif
}

#define Saturate(x, lower, upper) \
    ((x) > (upper) ? (upper) : ((x) >= (lower) ? (x) : (lower)))

//! convert float32 to int8: the low lanes of the result hold the real converted
//! data and the remaining (high) lanes repeat them
GI_FORCEINLINE
GI_INT8_t GiCvtFromFloat32ToInt8(GI_FLOAT32_t src) {
#if defined(GI_NEON_INTRINSICS)
#if __ARM_ARCH >= 8
    int32x4_t vres0 = vcvtaq_s32_f32(src);
    int16x8_t mid_s16 = vcombine_s16(vqmovn_s32(vres0), vqmovn_s32(vres0));
    return vcombine_s8(vqmovn_s16(mid_s16), vqmovn_s16(mid_s16));
#else
    float32x4_t vinc0 = vbslq_f32(vcgeq_f32(src, vfzero), vfhalf, vfneg_half);
    int32x4_t vres0 = vcvtq_s32_f32(vaddq_f32(src, vinc0));
    int16x8_t mid_s16 = vcombine_s16(vqmovn_s32(vres0), vqmovn_s32(vres0));
    return vcombine_s8(vqmovn_s16(mid_s16), vqmovn_s16(mid_s16));
#endif
#elif defined(GI_SSE42_INTRINSICS)
    __m128 vinc0 = _mm_blendv_ps(vfneg_half, vfhalf, _mm_cmpge_ps(src, vfzero));
    __m128 vres0 = _mm_add_ps(src, vinc0);
    vres0 = _mm_round_ps(vres0, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
    vres0 = _mm_min_ps(_mm_max_ps(vres0, vfmin_int8), vfmax_int8);

    __m128i vepi32_0 = _mm_cvtps_epi32(vres0);
    __m128i vepi16 = _mm_packs_epi32(vepi32_0, vepi32_0);
    __m128i vepi8 = _mm_packs_epi16(vepi16, vepi16);
    return vepi8;
#else
    GI_INT8_t ret;
    int length = GI_SIMD_LEN_BYTE / sizeof(float);
    for (int i = 0; i < length; i++) {
        int8_t data = Saturate(round(src[i]), -128, 127);
        ret[i] = data;
        ret[length + i] = data;
        ret[2 * length + i] = data;
        ret[3 * length + i] = data;
    }
    return ret;
#endif
}
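//! Illustrative example (assuming GI_SIMD_LEN_BYTE == 16, i.e. 4 float lanes):
//! src = {1.4f, -1.6f, 200.f, -300.f} rounds (half away from zero) and saturates
//! to the bytes {1, -2, 127, -128}; GiCvtFromFloat32ToInt8 then repeats this
//! 4-byte group across all 16 output bytes.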

GI_FORCEINLINE
GI_INT8_t GiCvtFromFloat32V2ToInt8(GI_FLOAT32_V2_t vsrc) {
#if defined(GI_NEON_INTRINSICS)
#if __ARM_ARCH >= 8
    int32x4_t vres0 = vcvtaq_s32_f32(vsrc.val[0]);
    int32x4_t vres1 = vcvtaq_s32_f32(vsrc.val[1]);
    int8x8_t mid1 = vqmovn_s16(vcombine_s16(vqmovn_s32(vres0), vqmovn_s32(vres1)));
    return vcombine_s8(mid1, mid1);
#else
    float32x4_t vinc0 = vbslq_f32(vcgeq_f32(vsrc.val[0], vfzero), vfhalf, vfneg_half);
    float32x4_t vinc1 = vbslq_f32(vcgeq_f32(vsrc.val[1], vfzero), vfhalf, vfneg_half);
    int32x4_t vres0 = vcvtq_s32_f32(vaddq_f32(vsrc.val[0], vinc0));
    int32x4_t vres1 = vcvtq_s32_f32(vaddq_f32(vsrc.val[1], vinc1));
    int8x8_t mid1 = vqmovn_s16(vcombine_s16(vqmovn_s32(vres0), vqmovn_s32(vres1)));
    return vcombine_s8(mid1, mid1);
#endif
#elif defined(GI_SSE42_INTRINSICS)
    __m128 vinc0 = _mm_blendv_ps(vfneg_half, vfhalf, _mm_cmpge_ps(vsrc.val[0], vfzero));
    __m128 vinc1 = _mm_blendv_ps(vfneg_half, vfhalf, _mm_cmpge_ps(vsrc.val[1], vfzero));

    __m128 vres0 = _mm_add_ps(vsrc.val[0], vinc0);
    __m128 vres1 = _mm_add_ps(vsrc.val[1], vinc1);

    vres0 = _mm_round_ps(vres0, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
    vres1 = _mm_round_ps(vres1, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);

    vres0 = _mm_min_ps(_mm_max_ps(vres0, vfmin_int8), vfmax_int8);
    vres1 = _mm_min_ps(_mm_max_ps(vres1, vfmin_int8), vfmax_int8);

    __m128i vepi32_0 = _mm_cvtps_epi32(vres0);
    __m128i vepi32_1 = _mm_cvtps_epi32(vres1);
    __m128i vepi16_0 = _mm_packs_epi32(vepi32_0, vepi32_1);
    __m128i vepi8 = _mm_packs_epi16(vepi16_0, vepi16_0);
    return vepi8;
#else
    GI_INT8_t ret;
    int length = GI_SIMD_LEN_BYTE / sizeof(float);
    for (int i = 0; i < 2 * length; i++) {
        ret[i] = Saturate(round(vsrc.val[i / length][i % length]), -128, 127);
    }
    return ret;
#endif
}

GI_FORCEINLINE
GI_INT8_t GiCvtFromFloat32V4ToInt8(GI_FLOAT32_V4_t vsrc) {
#if defined(GI_NEON_INTRINSICS)
#if __ARM_ARCH >= 8
    int32x4_t vres0 = vcvtaq_s32_f32(vsrc.val[0]);
    int32x4_t vres1 = vcvtaq_s32_f32(vsrc.val[1]);
    int32x4_t vres2 = vcvtaq_s32_f32(vsrc.val[2]);
    int32x4_t vres3 = vcvtaq_s32_f32(vsrc.val[3]);
    int8x8_t mid1 = vqmovn_s16(vcombine_s16(vqmovn_s32(vres0), vqmovn_s32(vres1)));
    int8x8_t mid2 = vqmovn_s16(vcombine_s16(vqmovn_s32(vres2), vqmovn_s32(vres3)));
    return vcombine_s8(mid1, mid2);
#else
    float32x4_t vfzero = vdupq_n_f32(0.f);
    float32x4_t vfhalf = vdupq_n_f32(0.5f);
    float32x4_t vfneg_half = vdupq_n_f32(-0.5f);
    float32x4_t vinc0 = vbslq_f32(vcgeq_f32(vsrc.val[0], vfzero), vfhalf, vfneg_half);
    float32x4_t vinc1 = vbslq_f32(vcgeq_f32(vsrc.val[1], vfzero), vfhalf, vfneg_half);
    float32x4_t vinc2 = vbslq_f32(vcgeq_f32(vsrc.val[2], vfzero), vfhalf, vfneg_half);
    float32x4_t vinc3 = vbslq_f32(vcgeq_f32(vsrc.val[3], vfzero), vfhalf, vfneg_half);
    int32x4_t vres0 = vcvtq_s32_f32(vaddq_f32(vsrc.val[0], vinc0));
    int32x4_t vres1 = vcvtq_s32_f32(vaddq_f32(vsrc.val[1], vinc1));
    int32x4_t vres2 = vcvtq_s32_f32(vaddq_f32(vsrc.val[2], vinc2));
    int32x4_t vres3 = vcvtq_s32_f32(vaddq_f32(vsrc.val[3], vinc3));
    int8x8_t mid1 = vqmovn_s16(vcombine_s16(vqmovn_s32(vres0), vqmovn_s32(vres1)));
    int8x8_t mid2 = vqmovn_s16(vcombine_s16(vqmovn_s32(vres2), vqmovn_s32(vres3)));
    return vcombine_s8(mid1, mid2);
#endif
#elif defined(GI_SSE42_INTRINSICS)
    __m128 vinc0 = _mm_blendv_ps(vfneg_half, vfhalf, _mm_cmpge_ps(vsrc.val[0], vfzero));
    __m128 vinc1 = _mm_blendv_ps(vfneg_half, vfhalf, _mm_cmpge_ps(vsrc.val[1], vfzero));
    __m128 vinc2 = _mm_blendv_ps(vfneg_half, vfhalf, _mm_cmpge_ps(vsrc.val[2], vfzero));
    __m128 vinc3 = _mm_blendv_ps(vfneg_half, vfhalf, _mm_cmpge_ps(vsrc.val[3], vfzero));

    __m128 vres0 = _mm_add_ps(vsrc.val[0], vinc0);
    __m128 vres1 = _mm_add_ps(vsrc.val[1], vinc1);
    __m128 vres2 = _mm_add_ps(vsrc.val[2], vinc2);
    __m128 vres3 = _mm_add_ps(vsrc.val[3], vinc3);

    vres0 = _mm_round_ps(vres0, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
    vres1 = _mm_round_ps(vres1, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
    vres2 = _mm_round_ps(vres2, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
    vres3 = _mm_round_ps(vres3, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);

    vres0 = _mm_min_ps(_mm_max_ps(vres0, vfmin_int8), vfmax_int8);
    vres1 = _mm_min_ps(_mm_max_ps(vres1, vfmin_int8), vfmax_int8);
    vres2 = _mm_min_ps(_mm_max_ps(vres2, vfmin_int8), vfmax_int8);
    vres3 = _mm_min_ps(_mm_max_ps(vres3, vfmin_int8), vfmax_int8);

    __m128i vepi32_0 = _mm_cvtps_epi32(vres0);
    __m128i vepi32_1 = _mm_cvtps_epi32(vres1);
    __m128i vepi32_2 = _mm_cvtps_epi32(vres2);
    __m128i vepi32_3 = _mm_cvtps_epi32(vres3);
    __m128i vepi16_0 = _mm_packs_epi32(vepi32_0, vepi32_1);
    __m128i vepi16_1 = _mm_packs_epi32(vepi32_2, vepi32_3);
    __m128i vepi8 = _mm_packs_epi16(vepi16_0, vepi16_1);
    return vepi8;
#else
    GI_INT8_t ret;
    int length = GI_SIMD_LEN_BYTE / sizeof(float);
    for (int i = 0; i < 4 * length; i++) {
        ret[i] = Saturate(round(vsrc.val[i / length][i % length]), -128, 127);
    }
    return ret;
#endif
}

// vim: syntax=cpp.doxygen