/**
 * \file dnn/src/fallback/general_intrinsic/gi_float.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2022 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */

#pragma once

#include "gi_common.h"

GI_FORCEINLINE
GI_INT32_t GiReinterpretAsInt32(GI_FLOAT32_t In) {
#if defined(GI_NEON_INTRINSICS)
    return vreinterpretq_s32_f32(In);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_castps_si128(In);
#else
    return *(GI_INT32_t*)(&In);
#endif
}

GI_FORCEINLINE
GI_UINT32_t GiReinterpretAsUint32(GI_FLOAT32_t In) {
#if defined(GI_NEON_INTRINSICS)
    return vreinterpretq_u32_f32(In);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_castps_si128(In);
#else
    return *(GI_UINT32_t*)(&In);
#endif
}

GI_FORCEINLINE
GI_FLOAT32_t GiReintInt32ToFloat32(GI_INT32_t Vector) {
#if defined(GI_NEON_INTRINSICS)
    return vreinterpretq_f32_s32(Vector);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_castsi128_ps(Vector);
#else
    return *(GI_FLOAT32_t*)(&Vector);
#endif
}

GI_FORCEINLINE
GI_FLOAT32_t GiReintUint32ToFloat32(GI_UINT32_t Vector) {
#if defined(GI_NEON_INTRINSICS)
    return vreinterpretq_f32_u32(Vector);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_castsi128_ps(Vector);
#else
    return *(GI_FLOAT32_t*)(&Vector);
#endif
}
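
//! Note: the GiReinterpretAs*/GiReint* helpers above reuse the bit pattern of
//! a vector under another element type (a bitcast); no value conversion is
//! performed. Use GiCastToInt32/GiCastToFloat32 below when a numeric
//! conversion is wanted.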

GI_FORCEINLINE
GI_INT32_t GiRoundAsInt32(GI_FLOAT32_t Vector) {
#if defined(GI_NEON_INTRINSICS)
#if __ARM_ARCH >= 8
    return vcvtaq_s32_f32(Vector);
#else
    float32x4_t vzero = vdupq_n_f32(0.f);
    float32x4_t vfhalf = vdupq_n_f32(0.5f);
    float32x4_t vfneg_half = vdupq_n_f32(-0.5f);
    float32x4_t vinc0 = vbslq_f32(vcgeq_f32(Vector, vzero), vfhalf, vfneg_half);
    return vcvtq_s32_f32(vaddq_f32(Vector, vinc0));
#endif
#elif defined(GI_SSE2_INTRINSICS)
    __m128 vfzero = _mm_set1_ps(0.f);
    __m128 vfhalf = _mm_set1_ps(0.5f);
    __m128 vfneg_half = _mm_set1_ps(-0.5f);
    __m128 vinc0 = _mm_blendv_ps(vfneg_half, vfhalf, _mm_cmpge_ps(Vector, vfzero));
    __m128 vres0 = _mm_add_ps(Vector, vinc0);
    return _mm_castps_si128(
            _mm_round_ps(vres0, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC));
#else
    GI_INT32_t ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
        ret[i] = (int32_t)round(Vector[i]);
    }
    return ret;
#endif
}
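
//! Note: GiRoundAsInt32 rounds to nearest with ties away from zero. Without
//! ARMv8's vcvtaq_s32_f32, the same result is obtained by adding +0.5 (or
//! -0.5 for negative lanes) and then converting with truncation toward zero,
//! e.g. 2.5f -> 3.0f -> 3 and -2.5f -> -3.0f -> -3. GiCastToInt32 below, by
//! contrast, truncates toward zero directly.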

GI_FORCEINLINE
GI_INT32_t GiCastToInt32(GI_FLOAT32_t Vector) {
#if defined(GI_NEON_INTRINSICS)
    return vcvtq_s32_f32(Vector);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_cvttps_epi32(Vector);
#else
    GI_INT32_t ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
        ret[i] = (int32_t)(Vector[i]);
    }
    return ret;
#endif
}

GI_FORCEINLINE
GI_FLOAT32_t GiCastToFloat32(GI_INT32_t Vector) {
#if defined(GI_NEON_INTRINSICS)
    return vcvtq_f32_s32(Vector);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_cvtepi32_ps(Vector);
#else
    GI_FLOAT32_t ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) {
        ret[i] = float(Vector[i]);
    }
    return ret;
#endif
}

GI_FORCEINLINE
GI_FLOAT32_t GiBroadcastFloat32(float Value) {
#if defined(GI_NEON_INTRINSICS)
    return vdupq_n_f32(Value);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_set1_ps(Value);
#else
    GI_FLOAT32_t ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
        ret[i] = Value;
    }
    return ret;
#endif
}

GI_FORCEINLINE
GI_FLOAT32_t GiLoadBroadcastFloat32(const float* Value) {
#if defined(GI_NEON_INTRINSICS)
    return vld1q_dup_f32(Value);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_load_ps1(Value);
#else
    GI_FLOAT32_t ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
        ret[i] = *Value;
    }
    return ret;
#endif
}

GI_FORCEINLINE
GI_FLOAT32_t GiZeroFloat32(void) {
#if defined(GI_NEON_INTRINSICS)
    return vdupq_n_f32(0.0f);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_setzero_ps();
#else
    return GiBroadcastFloat32(0.0f);
#endif
}

GI_FORCEINLINE
GI_FLOAT32_t GiLoadFloat32(const float* Buffer) {
#if defined(GI_NEON_INTRINSICS)
    return vld1q_f32(Buffer);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_loadu_ps(Buffer);
#else
    GI_FLOAT32_t ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
        ret[i] = Buffer[i];
    }
    return ret;
#endif
}

GI_FORCEINLINE
void GiStoreFloat32(float* Buffer, GI_FLOAT32_t Vector) {
#if defined(GI_NEON_INTRINSICS)
    vst1q_f32(Buffer, Vector);
#elif defined(GI_SSE2_INTRINSICS)
    _mm_storeu_ps(Buffer, Vector);
#else
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
        Buffer[i] = Vector[i];
    }
#endif
}

#if defined(GI_NEON_INTRINSICS)
#define GISTORELANEFLOAT32(i)                                                         \
    GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32_t Vector) { \
        vst1q_lane_f32(Buffer, Vector, i);                                            \
    }

#elif defined(GI_SSE2_INTRINSICS)

#define GISTORELANEFLOAT32(i)                                                          \
    GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32_t Vector) {  \
        _mm_store_ss(Buffer, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(i, i, i, i))); \
    }
#else
#define GISTORELANEFLOAT32(i)                                                         \
    GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32_t Vector) { \
        *Buffer = Vector[i];                                                          \
    }
#endif

GISTORELANEFLOAT32(0)
GISTORELANEFLOAT32(1)
GISTORELANEFLOAT32(2)
GISTORELANEFLOAT32(3)

#undef GISTORELANEFLOAT32

#if defined(GI_NEON_INTRINSICS)
#define GIEXTRACTLANEFLOAT32(i)                                           \
    GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32_t Vector) { \
        return vgetq_lane_f32(Vector, i);                                 \
    }
#elif defined(GI_SSE2_INTRINSICS)

#define GIEXTRACTLANEFLOAT32(i)                                                        \
    GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32_t Vector) {              \
        return _mm_cvtss_f32(_mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(i, i, i, i))); \
    }
#else
#define GIEXTRACTLANEFLOAT32(i)                                           \
    GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32_t Vector) { \
        return Vector[i];                                                 \
    }
#endif

GIEXTRACTLANEFLOAT32(0)
GIEXTRACTLANEFLOAT32(1)
GIEXTRACTLANEFLOAT32(2)
GIEXTRACTLANEFLOAT32(3)
#undef GIEXTRACTLANEFLOAT32

GI_FORCEINLINE
GI_FLOAT32_t GiInterleaveLowFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_NEON64_INTRINSICS)
    return vzip1q_f32(Vector1, Vector2);
#elif defined(GI_NEON32_INTRINSICS)
    float32x4x2_t zipped = vzipq_f32(Vector1, Vector2);
    return zipped.val[0];
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_unpacklo_ps(Vector1, Vector2);
#else
    GI_FLOAT32_t ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(float); i++) {
        ret[2 * i] = Vector1[i];
        ret[2 * i + 1] = Vector2[i];
    }
    return ret;
#endif
}

GI_FORCEINLINE
GI_FLOAT32_t GiInterleaveHighFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_NEON64_INTRINSICS)
    return vzip2q_f32(Vector1, Vector2);
#elif defined(GI_NEON32_INTRINSICS)
    float32x4x2_t zipped = vzipq_f32(Vector1, Vector2);
    return zipped.val[1];
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_unpackhi_ps(Vector1, Vector2);
#else
    GI_FLOAT32_t ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(float); i++) {
        ret[2 * i] = Vector1[GI_SIMD_LEN_BYTE / 2 / sizeof(float) + i];
        ret[2 * i + 1] = Vector2[GI_SIMD_LEN_BYTE / 2 / sizeof(float) + i];
    }
    return ret;
#endif
}
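
//! Note: with four float lanes, GiInterleaveLowFloat32({a0,a1,a2,a3},
//! {b0,b1,b2,b3}) yields {a0,b0,a1,b1} and GiInterleaveHighFloat32 yields
//! {a2,b2,a3,b3}, matching NEON vzip and SSE unpacklo/unpackhi semantics.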

GI_FORCEINLINE
GI_FLOAT32_t GiAddFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vaddq_f32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_add_ps(Vector1, Vector2);
#else
    return Vector1 + Vector2;
#endif
}

GI_FORCEINLINE
GI_FLOAT32_t GiSubtractFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vsubq_f32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_sub_ps(Vector1, Vector2);
#else
    return Vector1 - Vector2;
#endif
}

GI_FORCEINLINE
GI_FLOAT32_t GiMultiplyFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vmulq_f32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_mul_ps(Vector1, Vector2);
#else
    return Vector1 * Vector2;
#endif
}

GI_FORCEINLINE
GI_FLOAT32_t GiMultiplyScalerFloat32(GI_FLOAT32_t Vector1, float Scaler) {
#if defined(GI_NEON_INTRINSICS)
    return vmulq_n_f32(Vector1, Scaler);
#elif defined(GI_SSE2_INTRINSICS)
    GI_FLOAT32_t Vector2 = _mm_set1_ps(Scaler);
    return _mm_mul_ps(Vector1, Vector2);
#else
    return Vector1 * Scaler;
#endif
}

GI_FORCEINLINE
GI_FLOAT32_t GiMultiplyAddFloat32(
        GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
#if defined(__ARM_FEATURE_FMA)
    return vfmaq_f32(VectorSum, Vector1, Vector2);
#else
    return vmlaq_f32(VectorSum, Vector1, Vector2);
#endif
#elif defined(GI_FMA3_INTRINSICS)
    return _mm_fmadd_ps(Vector1, Vector2, VectorSum);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_add_ps(_mm_mul_ps(Vector1, Vector2), VectorSum);
#else
    return Vector1 * Vector2 + VectorSum;
#endif
}
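
//! Usage sketch (illustrative only; gi_example_axpy is a hypothetical helper,
//! not part of this header's API): dst[i] += alpha * src[i] over a buffer,
//! with a scalar tail for lengths that are not a multiple of the vector width.
GI_FORCEINLINE void gi_example_axpy(
        float* dst, const float* src, float alpha, size_t n) {
    const size_t step = GI_SIMD_LEN_BYTE / sizeof(float);
    GI_FLOAT32_t valpha = GiBroadcastFloat32(alpha);
    size_t i = 0;
    for (; i + step <= n; i += step) {
        GI_FLOAT32_t d = GiLoadFloat32(dst + i);
        GI_FLOAT32_t s = GiLoadFloat32(src + i);
        //! d + s * valpha, fused on platforms that provide FMA
        GiStoreFloat32(dst + i, GiMultiplyAddFloat32(d, s, valpha));
    }
    for (; i < n; i++) {
        dst[i] += alpha * src[i];
    }
}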

GI_FORCEINLINE
GI_FLOAT32_t GiMultiplySubFloat32(
        GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vmlsq_f32(VectorSum, Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_sub_ps(VectorSum, _mm_mul_ps(Vector1, Vector2));
#else
    return VectorSum - Vector1 * Vector2;
#endif
}

GI_FORCEINLINE
GI_FLOAT32_t GiMultiplyAddScalarFloat32(
        GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector, float Scalar) {
#if defined(GI_NEON_INTRINSICS)
#if defined(__ARM_FEATURE_FMA)
    return vfmaq_n_f32(VectorSum, Vector, Scalar);
#else
    return vmlaq_n_f32(VectorSum, Vector, Scalar);
#endif
#elif defined(GI_SSE2_INTRINSICS)
    return GiMultiplyAddFloat32(VectorSum, GiBroadcastFloat32(Scalar), Vector);
#else
    return VectorSum + Vector * Scalar;
#endif
}

#if defined(GI_NEON_INTRINSICS)
#if defined(__ARM_FEATURE_FMA)
#define GIMULTIPLYADDLANFLOAT32(i)                                                \
    GI_FORCEINLINE GI_FLOAT32_t GiMultiplyAddLan##i##Float32(                     \
            GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { \
        return vfmaq_lane_f32(VectorSum, Vector1, vget_low_f32(Vector2), i);      \
    }
GIMULTIPLYADDLANFLOAT32(0)
GIMULTIPLYADDLANFLOAT32(1)
#undef GIMULTIPLYADDLANFLOAT32
#define GIMULTIPLYADDLANFLOAT32(i)                                                \
    GI_FORCEINLINE GI_FLOAT32_t GiMultiplyAddLan##i##Float32(                     \
            GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { \
        return vfmaq_lane_f32(VectorSum, Vector1, vget_high_f32(Vector2), i - 2); \
    }
GIMULTIPLYADDLANFLOAT32(2)
GIMULTIPLYADDLANFLOAT32(3)
#else
#define GIMULTIPLYADDLANFLOAT32(i)                                                \
    GI_FORCEINLINE GI_FLOAT32_t GiMultiplyAddLan##i##Float32(                     \
            GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { \
        return vmlaq_lane_f32(VectorSum, Vector1, vget_low_f32(Vector2), i);      \
    }
GIMULTIPLYADDLANFLOAT32(0)
GIMULTIPLYADDLANFLOAT32(1)
#undef GIMULTIPLYADDLANFLOAT32
#define GIMULTIPLYADDLANFLOAT32(i)                                                \
    GI_FORCEINLINE GI_FLOAT32_t GiMultiplyAddLan##i##Float32(                     \
            GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { \
        return vmlaq_lane_f32(VectorSum, Vector1, vget_high_f32(Vector2), i - 2); \
    }
GIMULTIPLYADDLANFLOAT32(2)
GIMULTIPLYADDLANFLOAT32(3)
#endif
#undef GIMULTIPLYADDLANFLOAT32
#elif defined(GI_SSE2_INTRINSICS)

#define GIMULTIPLYADDLANFLOAT32(i)                                                \
    GI_FORCEINLINE GI_FLOAT32_t GiMultiplyAddLan##i##Float32(                     \
            GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { \
        return GiMultiplyAddScalarFloat32(                                        \
                VectorSum, Vector1, GiExtractLane##i##Float32(Vector2));          \
    }
GIMULTIPLYADDLANFLOAT32(0)
GIMULTIPLYADDLANFLOAT32(1)
GIMULTIPLYADDLANFLOAT32(2)
GIMULTIPLYADDLANFLOAT32(3)
#undef GIMULTIPLYADDLANFLOAT32
#else
#define GIMULTIPLYADDLANFLOAT32(i)                                                \
    GI_FORCEINLINE GI_FLOAT32_t GiMultiplyAddLan##i##Float32(                     \
            GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { \
        return VectorSum + Vector1 * Vector2[i];                                  \
    }
GIMULTIPLYADDLANFLOAT32(0)
GIMULTIPLYADDLANFLOAT32(1)
GIMULTIPLYADDLANFLOAT32(2)
GIMULTIPLYADDLANFLOAT32(3)
#undef GIMULTIPLYADDLANFLOAT32
#endif
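
//! Usage sketch (illustrative only; gi_example_mat4_mul_vec4 is a
//! hypothetical helper): the lane-broadcast multiply-add wrappers above map
//! naturally onto a 4x4 column-major matrix-vector product, accumulating one
//! column per lane of v.
GI_FORCEINLINE GI_FLOAT32_t gi_example_mat4_mul_vec4(
        const float* mat, GI_FLOAT32_t v) {
    //! mat holds 16 floats, column-major: column j starts at mat + 4 * j
    GI_FLOAT32_t acc = GiZeroFloat32();
    acc = GiMultiplyAddLan0Float32(acc, GiLoadFloat32(mat + 0), v);
    acc = GiMultiplyAddLan1Float32(acc, GiLoadFloat32(mat + 4), v);
    acc = GiMultiplyAddLan2Float32(acc, GiLoadFloat32(mat + 8), v);
    acc = GiMultiplyAddLan3Float32(acc, GiLoadFloat32(mat + 12), v);
    return acc;
}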

GI_FORCEINLINE
GI_FLOAT32_t GiDivideFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_NEON64_INTRINSICS)
    return vdivq_f32(Vector1, Vector2);
#elif defined(GI_NEON32_INTRINSICS)
    float32x4_t recp = vrecpeq_f32(Vector2);
    recp = vmulq_f32(vrecpsq_f32(Vector2, recp), recp);
    return vmulq_f32(Vector1, recp);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_div_ps(Vector1, Vector2);
#else
    return Vector1 / Vector2;
#endif
}

GI_FORCEINLINE
GI_FLOAT32_t GiRecpeSFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vrecpsq_f32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    GI_FLOAT32_t two = _mm_set1_ps(2.0f);
    return _mm_sub_ps(two, _mm_mul_ps(Vector1, Vector2));
#else
    return (2.0f - Vector1 * Vector2);
#endif
}

GI_FORCEINLINE
GI_FLOAT32_t GiRecpeFloat32(GI_FLOAT32_t Vector) {
#if defined(GI_NEON_INTRINSICS)
    return vrecpeq_f32(Vector);
#elif defined(GI_SSE2_INTRINSICS)
    GI_FLOAT32_t ones = _mm_set1_ps(1.0f);
    return _mm_div_ps(ones, Vector);
#else
    return 1 / Vector;
#endif
}
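
//! Usage sketch (illustrative only; gi_example_refined_recipe is a
//! hypothetical helper): one Newton-Raphson step x1 = x0 * (2 - v * x0)
//! refines the reciprocal estimate, roughly doubling its precision;
//! GiRecpeSFloat32 supplies the (2 - v * x0) factor.
GI_FORCEINLINE GI_FLOAT32_t gi_example_refined_recipe(GI_FLOAT32_t v) {
    GI_FLOAT32_t x = GiRecpeFloat32(v);
    return GiMultiplyFloat32(GiRecpeSFloat32(v, x), x);
}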

GI_FORCEINLINE
GI_FLOAT32_t GiNegFloat32(GI_FLOAT32_t Vector) {
#if defined(GI_NEON_INTRINSICS)
    return vnegq_f32(Vector);
#elif defined(GI_SSE2_INTRINSICS)
    GI_FLOAT32_t zero = _mm_set1_ps(0.0f);
    return _mm_sub_ps(zero, Vector);
#else
    return -Vector;
#endif
}

GI_FORCEINLINE
GI_UINT32_t GiGreaterThanFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vcgtq_f32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_castps_si128(_mm_cmpgt_ps(Vector1, Vector2));
#else
    GI_UINT32_t ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
        ret[i] = Vector1[i] > Vector2[i] ? 0xFFFFFFFF : 0;
    }
    return ret;
#endif
}

GI_FORCEINLINE
GI_UINT32_t GiLessThanEqFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vcleq_f32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_castps_si128(_mm_cmple_ps(Vector1, Vector2));
#else
    GI_UINT32_t ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
        ret[i] = Vector1[i] <= Vector2[i] ? 0xFFFFFFFF : 0;
    }
    return ret;
#endif
}

GI_FORCEINLINE
GI_UINT32_t GiLessThanFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vcltq_f32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_castps_si128(_mm_cmplt_ps(Vector1, Vector2));
#else
    GI_UINT32_t ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
        ret[i] = Vector1[i] < Vector2[i] ? 0xFFFFFFFF : 0;
    }
    return ret;
#endif
}

GI_FORCEINLINE
GI_FLOAT32_t GiAndFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_SSE2_INTRINSICS)
    return _mm_and_ps(Vector1, Vector2);
#else
    return GiReintInt32ToFloat32(
            GiAndInt32(GiReinterpretAsInt32(Vector1), GiReinterpretAsInt32(Vector2)));
#endif
}

GI_FORCEINLINE
GI_FLOAT32_t GiOrFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_SSE2_INTRINSICS)
    return _mm_or_ps(Vector1, Vector2);
#else
    return GiReintInt32ToFloat32(
            GiOrInt32(GiReinterpretAsInt32(Vector1), GiReinterpretAsInt32(Vector2)));
#endif
}

GI_FORCEINLINE
GI_FLOAT32_t GiAndNotFloat32(GI_FLOAT32_t VectorNot, GI_FLOAT32_t Vector) {
#if defined(GI_SSE2_INTRINSICS)
    return _mm_andnot_ps(VectorNot, Vector);
#else
    return GiReintInt32ToFloat32(GiAndNotInt32(
            GiReinterpretAsInt32(VectorNot), GiReinterpretAsInt32(Vector)));
#endif
}

GI_FORCEINLINE
GI_FLOAT32_t GiXorFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_SSE2_INTRINSICS)
    return _mm_xor_ps(Vector1, Vector2);
#else
    return GiReintInt32ToFloat32(
            GiXorInt32(GiReinterpretAsInt32(Vector1), GiReinterpretAsInt32(Vector2)));
#endif
}

GI_FORCEINLINE
GI_FLOAT32_t GiBlendFloat32(
        GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2, GI_FLOAT32_t Selection) {
    return GiOrFloat32(
            GiAndFloat32(Vector2, Selection), GiAndNotFloat32(Selection, Vector1));
}
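
//! Note: GiBlendFloat32 is a bitwise select, (Vector2 & Selection) |
//! (Vector1 & ~Selection); Selection is expected to be all-ones or all-zeros
//! per lane, as produced by comparison helpers such as GiGreaterThanFloat32
//! above.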

#define MIN_NAN(a, b) (isnan(a) || (a) < (b)) ? (a) : (b);
#define MAX_NAN(a, b) (isnan(a) || (a) > (b)) ? (a) : (b);
#define Max(a, b) ((a) > (b) ? (a) : (b))
#define Min(a, b) ((a) < (b) ? (a) : (b))

GI_FORCEINLINE
GI_FLOAT32_t GiBSLFloat32(
        GI_UINT32_t Selection, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vbslq_f32(Selection, Vector1, Vector2);
#else
    return GiBlendFloat32(Vector1, Vector2, GiReintUint32ToFloat32(Selection));
#endif
}
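
//! Usage sketch (illustrative only; gi_example_relu is a hypothetical
//! helper): a branch-free ReLU built from a comparison mask and GiBSLFloat32.
GI_FORCEINLINE GI_FLOAT32_t gi_example_relu(GI_FLOAT32_t v) {
    GI_UINT32_t positive = GiGreaterThanFloat32(v, GiZeroFloat32());
    return GiBSLFloat32(positive, v, GiZeroFloat32());
}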

GI_FORCEINLINE
GI_FLOAT32_t GiMaximumFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vmaxq_f32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_max_ps(Vector1, Vector2);
#else
    GI_FLOAT32_t max;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
        max[i] = Max(Vector1[i], Vector2[i]);
    }
    return max;
#endif
}

GI_FORCEINLINE
GI_FLOAT32_t GiMinimumFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vminq_f32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_min_ps(Vector1, Vector2);
#else
    GI_FLOAT32_t min;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
        min[i] = Min(Vector1[i], Vector2[i]);
    }
    return min;
#endif
}

GI_FORCEINLINE
GI_FLOAT32_t GiMaxNanFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vmaxq_f32(Vector1, Vector2);
#else
    //! _mm_max_ps does not follow the IEEE standard when an input is NaN, so
    //! implement it in plain C
    GI_FLOAT32_t max;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
        max[i] = MAX_NAN(Vector1[i], Vector2[i]);
    }
    return max;
#endif
}

GI_FORCEINLINE
GI_FLOAT32_t GiMinNanFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vminq_f32(Vector1, Vector2);
#else
    //! _mm_min_ps does not follow the IEEE standard when an input is NaN, so
    //! implement it in plain C
    GI_FLOAT32_t min;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
        min[i] = MIN_NAN(Vector1[i], Vector2[i]);
    }
    return min;
#endif
}

GI_FORCEINLINE
GI_FLOAT32_t GiClampFloat32(GI_FLOAT32_t Value, float LowerRange, float UpperRange) {
    Value = GiMaximumFloat32(GiBroadcastFloat32(LowerRange), Value);
    Value = GiMinimumFloat32(GiBroadcastFloat32(UpperRange), Value);
    return Value;
}

GI_FORCEINLINE
float GiReduceAddFloat32(GI_FLOAT32_t Vector) {
#if defined(GI_NEON64_INTRINSICS)
    Vector = vpaddq_f32(Vector, Vector);
    Vector = vpaddq_f32(Vector, Vector);
    return vgetq_lane_f32(Vector, 0);
#elif defined(GI_NEON32_INTRINSICS)
    float32x2_t VectorLow = vget_low_f32(Vector);
    float32x2_t VectorHigh = vget_high_f32(Vector);
    VectorLow = vpadd_f32(VectorLow, VectorHigh);
    VectorLow = vpadd_f32(VectorLow, VectorHigh);
    return vget_lane_f32(VectorLow, 0);
#elif defined(GI_SSE2_INTRINSICS)
    Vector = GiAddFloat32(
            Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(2, 3, 2, 3)));
    Vector = GiAddFloat32(
            Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(1, 1, 1, 1)));
    return GiExtractLane0Float32(Vector);
#else
    float ret = 0;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
        ret += Vector[i];
    }
    return ret;
#endif
}
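
//! Note: the horizontal sum takes log2(lanes) pairwise steps: NEON uses
//! pairwise adds, while the SSE path first adds the upper half onto the lower
//! half and then lane 1 onto lane 0, so the total lands in lane 0.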

GI_FORCEINLINE
float GiReduceMultiplyFloat32(GI_FLOAT32_t Vector) {
#if defined(GI_NEON_INTRINSICS)
    float32x2_t low = vget_low_f32(Vector);
    float32x2_t high = vget_high_f32(Vector);
    float32x2_t res = vmul_f32(low, high);
    return vget_lane_f32(res, 0) * vget_lane_f32(res, 1);
#elif defined(GI_SSE2_INTRINSICS)
    Vector = GiMultiplyFloat32(
            Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(2, 3, 2, 3)));
    Vector = GiMultiplyFloat32(
            Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(1, 1, 1, 1)));
    return GiExtractLane0Float32(Vector);
#else
    float ret = 1;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
        ret *= Vector[i];
    }
    return ret;
#endif
}

GI_FORCEINLINE
float GiReduceMaxNanFloat32(GI_FLOAT32_t Vector) {
#if defined(GI_NEON64_INTRINSICS)
    return vmaxvq_f32(Vector);
#elif defined(GI_NEON32_INTRINSICS)
    float32x2_t VectorLow = vget_low_f32(Vector);
    float32x2_t VectorHigh = vget_high_f32(Vector);
    VectorLow = vpmax_f32(VectorLow, VectorHigh);
    VectorLow = vpmax_f32(VectorLow, VectorHigh);
    return vget_lane_f32(VectorLow, 0);
#elif defined(GI_SSE2_INTRINSICS)
    Vector = GiMaxNanFloat32(
            Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(2, 3, 2, 3)));
    Vector = GiMaxNanFloat32(
            Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(1, 1, 1, 1)));
    return GiExtractLane0Float32(Vector);
#else
    float ret = Vector[0];
    for (size_t i = 1; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
        ret = MAX_NAN(ret, Vector[i]);
    }
    return ret;
#endif
}

GI_FORCEINLINE
float GiReduceMinNanFloat32(GI_FLOAT32_t Vector) {
#if defined(GI_NEON64_INTRINSICS)
    return vminvq_f32(Vector);
#elif defined(GI_NEON32_INTRINSICS)
    float32x2_t VectorLow = vget_low_f32(Vector);
    float32x2_t VectorHigh = vget_high_f32(Vector);
    VectorLow = vpmin_f32(VectorLow, VectorHigh);
    VectorLow = vpmin_f32(VectorLow, VectorHigh);
    return vget_lane_f32(VectorLow, 0);
#elif defined(GI_SSE2_INTRINSICS)
    Vector = GiMinNanFloat32(
            Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(2, 3, 2, 3)));
    Vector = GiMinNanFloat32(
            Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(1, 1, 1, 1)));
    return GiExtractLane0Float32(Vector);
#else
    float ret = Vector[0];
    for (size_t i = 1; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
        ret = MIN_NAN(ret, Vector[i]);
    }
    return ret;
#endif
}

GI_FORCEINLINE
GI_FLOAT32_t GiAbsFloat32(GI_FLOAT32_t Vector1) {
#if defined(GI_NEON_INTRINSICS)
    return vabsq_f32(Vector1);
#elif defined(GI_SSE2_INTRINSICS)
    union {
        unsigned int int_val;
        float float_val;
    } value;
    value.int_val = 0x7fffffff;
    return _mm_and_ps(Vector1, _mm_set_ps1(value.float_val));
#else
    GI_FLOAT32_t ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
        ret[i] = Vector1[i] > 0 ? Vector1[i] : -Vector1[i];
    }
    return ret;
#endif
}
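
//! Note: on SSE, the absolute value is taken by masking off the IEEE-754
//! sign bit with 0x7fffffff, leaving exponent and mantissa untouched.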

// vim: syntax=cpp.doxygen