gi_float.h 21.8 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
/**
 * \file dnn/src/fallback/general_intrinsic/gi_float.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2022 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */

#pragma once

#include "gi_common.h"

GI_FORCEINLINE
17
GI_INT32_t GiReinterpretAsInt32(GI_FLOAT32_t In) {
18 19 20 21 22
#if defined(GI_NEON_INTRINSICS)
    return vreinterpretq_s32_f32(In);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_castps_si128(In);
#else
23
    return (GI_INT32_t)In;
24 25 26 27
#endif
}

GI_FORCEINLINE
28 29 30 31 32 33
GI_UINT32_t GiReinterpretAsUint32(GI_FLOAT32_t In) {
#if defined(GI_NEON_INTRINSICS)
    return vreinterpretq_u32_f32(In);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_castps_si128(In);
#else
34
    return (GI_UINT32_t)In;
35 36 37 38 39 40 41 42 43 44
#endif
}

GI_FORCEINLINE
GI_FLOAT32_t GiReintInt32ToFloat32(GI_INT32_t Vector) {
#if defined(GI_NEON_INTRINSICS)
    return vreinterpretq_f32_s32(Vector);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_castsi128_ps(Vector);
#else
45
    return (GI_FLOAT32_t)In;
46 47 48 49 50 51 52 53 54 55
#endif
}

GI_FORCEINLINE
GI_FLOAT32_t GiReintUint32ToFloat32(GI_UINT32_t Vector) {
#if defined(GI_NEON_INTRINSICS)
    return vreinterpretq_f32_u32(Vector);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_castsi128_ps(Vector);
#else
56
    return (GI_FLOAT32_t)In;
57 58 59 60 61
#endif
}

GI_FORCEINLINE
GI_INT32_t GiRoundAsInt32(GI_FLOAT32_t Vector) {
62 63 64 65
#if defined(GI_NEON_INTRINSICS)
#if __ARM_ARCH >= 8
    return vcvtaq_s32_f32(Vector);
#else
66
    float32x4_t vinc0 = vbslq_f32(vcgeq_f32(Vector, vfzero), vfhalf, vfneg_half);
67 68
    return vcvtq_s32_f32(vaddq_f32(Vector, vinc0));
#endif
69
#elif defined(GI_SSE42_INTRINSICS)
70
    __m128 vinc0 = _mm_blendv_ps(vfneg_half, vfhalf, _mm_cmpge_ps(Vector, vfzero));
71
    return _mm_cvttps_epi32(_mm_add_ps(Vector, vinc0));
72
#else
73
    GI_INT32_t ret;
74 75 76 77 78 79 80 81
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
        ret[i] = (int32_t)round(Vector[i]);
    }
    return ret;
#endif
}

GI_FORCEINLINE
82
GI_INT32_t GiCastToInt32(GI_FLOAT32_t Vector) {
83
#if defined(GI_NEON_INTRINSICS)
84
    return vcvtq_s32_f32(Vector);
85
#elif defined(GI_SSE2_INTRINSICS)
86
    return _mm_cvttps_epi32(Vector);
87
#else
88 89 90
    GI_INT32_t ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
        ret[i] = (int32_t)(Vector[i]);
91 92 93 94 95 96
    }
    return ret;
#endif
}

GI_FORCEINLINE
97
GI_FLOAT32_t GiCastToFloat32(GI_INT32_t Vector) {
98
#if defined(GI_NEON_INTRINSICS)
99
    return vcvtq_f32_s32(Vector);
100
#elif defined(GI_SSE2_INTRINSICS)
101
    return _mm_cvtepi32_ps(Vector);
102
#else
103 104
    GI_FLOAT32_t ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) {
105
        ret[i] = (float)Vector[i];
106 107 108 109 110 111
    }
    return ret;
#endif
}

GI_FORCEINLINE
112
GI_FLOAT32_t GiLoadBroadcastFloat32(const float* Value) {
113 114 115 116 117
#if defined(GI_NEON_INTRINSICS)
    return vld1q_dup_f32(Value);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_load_ps1(Value);
#else
118
    GI_FLOAT32_t ret;
119 120 121 122 123 124 125 126
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
        ret[i] = *Value;
    }
    return ret;
#endif
}

GI_FORCEINLINE
127
GI_FLOAT32_t GiZeroFloat32(void) {
128 129 130 131 132 133 134 135 136 137
#if defined(GI_NEON_INTRINSICS)
    return vdupq_n_f32(0.0f);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_setzero_ps();
#else
    return GiBroadcastFloat32(0.0f);
#endif
}

GI_FORCEINLINE
138
GI_FLOAT32_t GiLoadFloat32(const float* Buffer) {
139 140 141 142 143
#if defined(GI_NEON_INTRINSICS)
    return vld1q_f32(Buffer);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_loadu_ps(Buffer);
#else
144
    GI_FLOAT32_t ret;
145 146 147 148 149 150 151 152
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
        ret[i] = Buffer[i];
    }
    return ret;
#endif
}

GI_FORCEINLINE
153
void GiStoreFloat32(float* Buffer, GI_FLOAT32_t Vector) {
154 155 156 157 158 159 160 161 162 163 164 165
#if defined(GI_NEON_INTRINSICS)
    vst1q_f32(Buffer, Vector);
#elif defined(GI_SSE2_INTRINSICS)
    _mm_storeu_ps(Buffer, Vector);
#else
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
        Buffer[i] = Vector[i];
    }
#endif
}

#if defined(GI_NEON_INTRINSICS)
166 167 168
#define GISTORELANEFLOAT32(i)                                                         \
    GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32_t Vector) { \
        vst1q_lane_f32(Buffer, Vector, i);                                            \
169 170 171 172 173
    }

#elif defined(GI_SSE2_INTRINSICS)

#define GISTORELANEFLOAT32(i)                                                          \
174
    GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32_t Vector) {  \
175 176 177
        _mm_store_ss(Buffer, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(i, i, i, i))); \
    }
#else
178 179 180
#define GISTORELANEFLOAT32(i)                                                         \
    GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32_t Vector) { \
        *Buffer = Vector[i];                                                          \
181 182 183 184 185 186 187 188 189 190 191
    }
#endif

GISTORELANEFLOAT32(0)
GISTORELANEFLOAT32(1)
GISTORELANEFLOAT32(2)
GISTORELANEFLOAT32(3)

#undef GISTORELANEFLOAT32

#if defined(GI_NEON_INTRINSICS)
192 193 194
#define GIEXTRACTLANEFLOAT32(i)                                           \
    GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32_t Vector) { \
        return vgetq_lane_f32(Vector, i);                                 \
195 196 197 198
    }
#elif defined(GI_SSE2_INTRINSICS)

#define GIEXTRACTLANEFLOAT32(i)                                                        \
199
    GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32_t Vector) {              \
200 201 202
        return _mm_cvtss_f32(_mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(i, i, i, i))); \
    }
#else
203 204 205
#define GIEXTRACTLANEFLOAT32(i)                                           \
    GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32_t Vector) { \
        return Vector[i];                                                 \
206 207 208 209 210 211 212 213 214 215
    }
#endif

GIEXTRACTLANEFLOAT32(0)
GIEXTRACTLANEFLOAT32(1)
GIEXTRACTLANEFLOAT32(2)
GIEXTRACTLANEFLOAT32(3)
#undef GIEXTRACTLANEFLOAT32

GI_FORCEINLINE
216
GI_FLOAT32_t GiInterleaveLowFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
217 218 219
#if defined(GI_NEON64_INTRINSICS)
    return vzip1q_f32(Vector1, Vector2);
#elif defined(GI_NEON32_INTRINSICS)
220
    float32x4x2_t zipped = vzipq_f32(Vector1, Vector2);
221 222 223 224
    return zipped.val[0];
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_unpacklo_ps(Vector1, Vector2);
#else
225
    GI_FLOAT32_t ret;
226 227 228 229 230 231 232 233 234
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(float); i++) {
        ret[2 * i] = Vector1[i];
        ret[2 * i + 1] = Vector2[i];
    }
    return ret;
#endif
}

GI_FORCEINLINE
235
GI_FLOAT32_t GiInterleaveHighFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
236 237 238
#if defined(GI_NEON64_INTRINSICS)
    return vzip2q_f32(Vector1, Vector2);
#elif defined(GI_NEON32_INTRINSICS)
239
    float32x4x2_t zipped = vzipq_f32(Vector1, Vector2);
240 241 242 243
    return zipped.val[1];
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_unpackhi_ps(Vector1, Vector2);
#else
244
    GI_FLOAT32_t ret;
245 246 247 248 249 250 251 252 253
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(float); i++) {
        ret[2 * i] = Vector1[GI_SIMD_LEN_BYTE / 2 + i];
        ret[2 * i + 1] = Vector2[GI_SIMD_LEN_BYTE / 2 + i];
    }
    return ret;
#endif
}

GI_FORCEINLINE
254
GI_FLOAT32_t GiAddFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
255 256 257 258 259 260 261 262 263 264
#if defined(GI_NEON_INTRINSICS)
    return vaddq_f32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_add_ps(Vector1, Vector2);
#else
    return Vector1 + Vector2;
#endif
}

GI_FORCEINLINE
265
GI_FLOAT32_t GiSubtractFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
266 267 268 269 270 271 272 273 274 275
#if defined(GI_NEON_INTRINSICS)
    return vsubq_f32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_sub_ps(Vector1, Vector2);
#else
    return Vector1 - Vector2;
#endif
}

GI_FORCEINLINE
276
GI_FLOAT32_t GiMultiplyFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
277 278 279 280 281 282 283 284 285 286
#if defined(GI_NEON_INTRINSICS)
    return vmulq_f32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_mul_ps(Vector1, Vector2);
#else
    return Vector1 * Vector2;
#endif
}

GI_FORCEINLINE
287
GI_FLOAT32_t GiMultiplyScalerFloat32(GI_FLOAT32_t Vector1, float Scaler) {
288 289 290
#if defined(GI_NEON_INTRINSICS)
    return vmulq_n_f32(Vector1, Scaler);
#elif defined(GI_SSE2_INTRINSICS)
291
    GI_FLOAT32_t Vector2 = _mm_set1_ps(Scaler);
292 293 294 295 296 297 298
    return _mm_mul_ps(Vector1, Vector2);
#else
    return Vector1 * Scaler;
#endif
}

GI_FORCEINLINE
299 300
GI_FLOAT32_t GiMultiplyAddFloat32(
        GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
301
#if defined(GI_NEON_INTRINSICS)
302
    return v_fma_ps_f32(VectorSum, Vector1, Vector2);
303 304 305 306 307 308 309 310 311 312
#elif defined(GI_FMA3_INTRINSICS)
    return _mm_fmadd_ps(Vector1, Vector2, VectorSum);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_add_ps(_mm_mul_ps(Vector1, Vector2), VectorSum);
#else
    return Vector1 * Vector2 + VectorSum;
#endif
}

GI_FORCEINLINE
313 314
GI_FLOAT32_t GiMultiplySubFloat32(
        GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
315
#if defined(GI_NEON_INTRINSICS)
316 317 318 319 320 321 322 323 324 325 326 327
    return vmlsq_f32(VectorSum, Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_sub_ps(VectorSum, _mm_mul_ps(Vector1, Vector2));
#else
    return VectorSum - Vector1 * Vector2;
#endif
}

GI_FORCEINLINE
GI_FLOAT32_t GiMultiplyAddScalarFloat32(
        GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector, float Scalar) {
#if defined(GI_NEON_INTRINSICS)
328
    return v_fma_n_f32(VectorSum, Vector, Scalar);
329
#elif defined(GI_SSE2_INTRINSICS)
330
    return GiMultiplyAddFloat32(VectorSum, GiBroadcastFloat32(Scalar), Vector);
331 332 333 334 335 336
#else
    return VectorSum + Vector * Scalar;
#endif
}

#if defined(GI_NEON_INTRINSICS)
337 338 339
#define GIMULTIPLYADDLANFLOAT32(i)                                                \
    GI_FORCEINLINE GI_FLOAT32_t GiMultiplyAddLan##i##Float32(                     \
            GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { \
340
        return v_fma_lane_f32(VectorSum, Vector1, vget_low_f32(Vector2), i);      \
341 342 343 344 345
    }
GIMULTIPLYADDLANFLOAT32(0)
GIMULTIPLYADDLANFLOAT32(1)
#undef GIMULTIPLYADDLANFLOAT32
#define GIMULTIPLYADDLANFLOAT32(i)                                                \
346 347
    GI_FORCEINLINE GI_FLOAT32_t GiMultiplyAddLan##i##Float32(                     \
            GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { \
348
        return v_fma_lane_f32(VectorSum, Vector1, vget_high_f32(Vector2), i - 2); \
349 350 351
    }
GIMULTIPLYADDLANFLOAT32(2)
GIMULTIPLYADDLANFLOAT32(3)
352 353 354
#undef GIMULTIPLYADDLANFLOAT32
#elif defined(GI_SSE2_INTRINSICS)

355 356 357 358 359
#define GIMULTIPLYADDLANFLOAT32(i)                                                \
    GI_FORCEINLINE GI_FLOAT32_t GiMultiplyAddLan##i##Float32(                     \
            GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { \
        return GiMultiplyAddScalarFloat32(                                        \
                VectorSum, Vector1, GiExtractLane##i##Float32(Vector2));          \
360 361 362 363 364 365 366
    }
GIMULTIPLYADDLANFLOAT32(0)
GIMULTIPLYADDLANFLOAT32(1)
GIMULTIPLYADDLANFLOAT32(2)
GIMULTIPLYADDLANFLOAT32(3)
#undef GIMULTIPLYADDLANFLOAT32
#else
367 368 369 370
#define GIMULTIPLYADDLANFLOAT32(i)                                                \
    GI_FORCEINLINE GI_FLOAT32_t GiMultiplyAddLan##i##Float32(                     \
            GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { \
        return VectorSum + Vector1 * Vector2[i];                                  \
371 372 373 374 375 376 377 378 379
    }
GIMULTIPLYADDLANFLOAT32(0)
GIMULTIPLYADDLANFLOAT32(1)
GIMULTIPLYADDLANFLOAT32(2)
GIMULTIPLYADDLANFLOAT32(3)
#undef GIMULTIPLYADDLANFLOAT32
#endif

GI_FORCEINLINE
380
GI_FLOAT32_t GiDivideFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
381 382 383 384 385 386 387 388 389 390 391 392 393 394
#if defined(GI_NEON64_INTRINSICS)
    return vdivq_f32(Vector1, Vector2);
#elif defined(GI_NEON32_INTRINSICS)
    float32x4_t recp = vrecpeq_f32(Vector2);
    recp = vmulq_f32(vrecpsq_f32(Vector2, recp), recp);
    return vmulq_f32(Vector1, recp);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_div_ps(Vector1, Vector2);
#else
    return Vector1 / Vector2;
#endif
}

GI_FORCEINLINE
395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446
GI_FLOAT32_t GiRecpeSFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_NEON64_INTRINSICS)
    return vrecpsq_f32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    GI_FLOAT32_t two = _mm_set1_ps(2.0f);
    return _mm_sub_ps(two, _mm_mul_ps(Vector1, Vector2));
#else
    return (2.0f - Vector1 * Vector2);
#endif
}

GI_FORCEINLINE
GI_FLOAT32_t GiRecpeFloat32(GI_FLOAT32_t Vector) {
#if defined(GI_NEON32_INTRINSICS)
    return vrecpeq_f32(Vector);
#elif defined(GI_SSE2_INTRINSICS)
    GI_FLOAT32_t ones = _mm_set1_ps(1.0f);
    return _mm_div_ps(ones, Vector);
#else
    return 1 / Vector;
#endif
}

GI_FORCEINLINE
GI_FLOAT32_t GiNegFloat32(GI_FLOAT32_t Vector) {
#if defined(GI_NEON32_INTRINSICS)
    return vnegq_f32(Vector);
#elif defined(GI_SSE2_INTRINSICS)
    GI_FLOAT32_t zero = _mm_set1_ps(0.0f);
    return _mm_sub_ps(zero, Vector);
#else
    return -Vector;
#endif
}

GI_FORCEINLINE
GI_UINT32_t GiGreaterThanFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vcgtq_f32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_castps_si128(_mm_cmpgt_ps(Vector1, Vector2));
#else
    GI_UINT32_t ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
        ret[i] = Vector1[i] > Vector2[i] ? 0xFFFFFFFF : 0;
    }
    return ret;
#endif
}

GI_FORCEINLINE
GI_UINT32_t GiLessThanEqFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
447
#if defined(GI_NEON_INTRINSICS)
448
    return vcleq_f32(Vector1, Vector2);
449
#elif defined(GI_SSE2_INTRINSICS)
450
    return _mm_castps_si128(_mm_cmple_ps(Vector1, Vector2));
451
#else
452 453 454 455 456
    GI_UINT32_t ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
        ret[i] = Vector1[i] <= Vector2[i] ? 0xFFFFFFFF : 0;
    }
    return ret;
457 458 459 460
#endif
}

GI_FORCEINLINE
461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476
GI_UINT32_t GiLessThanFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vcltq_f32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_castps_si128(_mm_cmplt_ps(Vector1, Vector2));
#else
    GI_UINT32_t ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
        ret[i] = Vector1[i] < Vector2[i] ? 0xFFFFFFFF : 0;
    }
    return ret;
#endif
}

GI_FORCEINLINE
GI_FLOAT32_t GiAndFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
477 478 479
#if defined(GI_SSE2_INTRINSICS)
    return _mm_and_ps(Vector1, Vector2);
#else
480
    return GiReintInt32ToFloat32(
481 482 483 484 485
            GiAndInt32(GiReinterpretAsInt32(Vector1), GiReinterpretAsInt32(Vector2)));
#endif
}

GI_FORCEINLINE
486
GI_FLOAT32_t GiOrFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
487 488 489
#if defined(GI_SSE2_INTRINSICS)
    return _mm_or_ps(Vector1, Vector2);
#else
490
    return GiReintInt32ToFloat32(
491 492 493 494 495
            GiOrInt32(GiReinterpretAsInt32(Vector1), GiReinterpretAsInt32(Vector2)));
#endif
}

GI_FORCEINLINE
496
GI_FLOAT32_t GiAndNotFloat32(GI_FLOAT32_t VectorNot, GI_FLOAT32_t Vector) {
497 498 499
#if defined(GI_SSE2_INTRINSICS)
    return _mm_andnot_ps(VectorNot, Vector);
#else
500
    return GiReintInt32ToFloat32(GiAndNotInt32(
501 502 503 504 505
            GiReinterpretAsInt32(VectorNot), GiReinterpretAsInt32(Vector)));
#endif
}

GI_FORCEINLINE
506
GI_FLOAT32_t GiXorFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
507 508 509
#if defined(GI_SSE2_INTRINSICS)
    return _mm_xor_ps(Vector1, Vector2);
#else
510
    return GiReintInt32ToFloat32(
511 512 513 514 515
            GiXorInt32(GiReinterpretAsInt32(Vector1), GiReinterpretAsInt32(Vector2)));
#endif
}

GI_FORCEINLINE
516 517
GI_FLOAT32_t GiBlendFloat32(
        GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2, GI_FLOAT32_t Selection) {
518 519 520 521
    return GiOrFloat32(
            GiAndFloat32(Vector2, Selection), GiAndNotFloat32(Selection, Vector1));
}

522 523 524
#define MIN_NAN(a, b) (isnan(a) || (a) < (b)) ? (a) : (b);
#define MAX_NAN(a, b) (isnan(a) || (a) > (b)) ? (a) : (b);

525
GI_FORCEINLINE
526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566
GI_FLOAT32_t GiBSLFloat32(
        GI_UINT32_t Selection, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vbslq_f32(Selection, Vector1, Vector2);
#else
    return GiBlendFloat32(Vector1, Vector2, GiReintUint32ToFloat32(Selection));
#endif
}

GI_FORCEINLINE
GI_FLOAT32_t GiMaximumFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vmaxq_f32(Vector1, Vector2);
#elif defined(GI_NEON32_INTRINSICS)
    return _mm_max_ps(Vector1, Vector2);
#else
    GI_FLOAT32_t max;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
        max[i] = Max(Vector1[i], Vector2[i]);
    }
    return max;
#endif
}

GI_FORCEINLINE
GI_FLOAT32_t GiMinimumFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vminq_f32(Vector1, Vector2);
#elif defined(GI_NEON32_INTRINSICS)
    return _mm_min_ps(Vector1, Vector2);
#else
    GI_FLOAT32_t min;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
        min[i] = Min(Vector1[i], Vector2[i]);
    }
    return min;
#endif
}

GI_FORCEINLINE
GI_FLOAT32_t GiMaxNanFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
567 568
#if defined(GI_NEON_INTRINSICS)
    return vmaxq_f32(Vector1, Vector2);
569
#else
570 571
    //! _mm_max_ps does not fellow the IEEE standard when input is NAN, so
    //! implement by C code
572 573
#define MAX_NAN(a, b) (isnan(a) || (a) > (b)) ? (a) : (b);
    GI_FLOAT32_t max;
574 575 576 577
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
        max[i] = MAX_NAN(Vector1[i], Vector2[i]);
    }
    return max;
578 579 580 581
#endif
}

GI_FORCEINLINE
582
GI_FLOAT32_t GiMinNanFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
583 584
#if defined(GI_NEON_INTRINSICS)
    return vminq_f32(Vector1, Vector2);
585
#else
586 587
    //! _mm_min_ps does not fellow the IEEE standard when input is NAN, so
    //! implement by C code
588 589
#define MIN_NAN(a, b) (isnan(a) || (a) < (b)) ? (a) : (b);
    GI_FLOAT32_t min;
590 591 592 593
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
        min[i] = MIN_NAN(Vector1[i], Vector2[i]);
    }
    return min;
594 595 596 597
#endif
}

GI_FORCEINLINE
598
GI_FLOAT32_t GiClampFloat32(GI_FLOAT32_t Value, float LowerRange, float UpperRange) {
599 600 601 602 603 604
    Value = GiMaximumFloat32(GiBroadcastFloat32(LowerRange), Value);
    Value = GiMinimumFloat32(GiBroadcastFloat32(UpperRange), Value);
    return Value;
}

GI_FORCEINLINE
605
float GiReduceAddFloat32(GI_FLOAT32_t Vector) {
606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631
#if defined(GI_NEON64_INTRINSICS)
    Vector = vpaddq_f32(Vector, Vector);
    Vector = vpaddq_f32(Vector, Vector);
    return vgetq_lane_f32(Vector, 0);
#elif defined(GI_NEON32_INTRINSICS)
    float32x2_t VectorLow = vget_low_f32(Vector);
    float32x2_t VectorHigh = vget_high_f32(Vector);
    VectorLow = vpadd_f32(VectorLow, VectorHigh);
    VectorLow = vpadd_f32(VectorLow, VectorHigh);
    return vget_lane_f32(VectorLow, 0);
#elif defined(GI_SSE2_INTRINSICS)
    Vector = GiAddFloat32(
            Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(2, 3, 2, 3)));
    Vector = GiAddFloat32(
            Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(1, 1, 1, 1)));
    return GiExtractLane0Float32(Vector);
#else
    float ret = 0;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
        ret += Vector[i];
    }
    return ret;
#endif
}

GI_FORCEINLINE
632
float GiReduceMultiplyFloat32(GI_FLOAT32_t Vector) {
633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656
#if defined(GI_NEON64_INTRINSICS)
    float32x2_t low = vget_low_f32(Vector);
    float32x2_t high = vget_high_f32(Vector);
    float32x2_t res = vmul_f32(low, high);
    return vget_lane_f32(res, 0) * vget_lane_f32(res, 1);
#elif defined(GI_SSE2_INTRINSICS)
    Vector = GiMultiplyFloat32(
            Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(2, 3, 2, 3)));
    Vector = GiMultiplyFloat32(
            Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(1, 1, 1, 1)));
    return GiExtractLane0Float32(Vector);
#else
    float ret = 1;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
        ret *= Vector[i];
    }
    return ret;
#endif
}

#define Max(a, b) (a) > (b) ? (a) : (b)
#define Min(a, b) (a) < (b) ? (a) : (b)

GI_FORCEINLINE
657
float GiReduceMaxNanFloat32(GI_FLOAT32_t Vector) {
658 659 660 661 662 663 664 665 666
#if defined(GI_NEON64_INTRINSICS)
    return vmaxvq_f32(Vector);
#elif defined(GI_NEON32_INTRINSICS)
    float32x2_t VectorLow = vget_low_f32(Vector);
    float32x2_t VectorHigh = vget_high_f32(Vector);
    VectorLow = vpmax_f32(VectorLow, VectorHigh);
    VectorLow = vpmax_f32(VectorLow, VectorHigh);
    return vget_lane_f32(VectorLow, 0);
#elif defined(GI_SSE2_INTRINSICS)
667
    Vector = GiMaxNanFloat32(
668
            Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(2, 3, 2, 3)));
669
    Vector = GiMaxNanFloat32(
670 671 672 673 674
            Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(1, 1, 1, 1)));
    return GiExtractLane0Float32(Vector);
#else
    float ret = Vector[0];
    for (size_t i = 1; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
675
        ret = MAX_NAN(ret, Vector[i]);
676 677 678 679 680 681
    }
    return ret;
#endif
}

GI_FORCEINLINE
682
float GiReduceMinNanFloat32(GI_FLOAT32_t Vector) {
683 684 685 686 687 688 689 690 691
#if defined(GI_NEON64_INTRINSICS)
    return vminvq_f32(Vector);
#elif defined(GI_NEON32_INTRINSICS)
    float32x2_t VectorLow = vget_low_f32(Vector);
    float32x2_t VectorHigh = vget_high_f32(Vector);
    VectorLow = vpmin_f32(VectorLow, VectorHigh);
    VectorLow = vpmin_f32(VectorLow, VectorHigh);
    return vget_lane_f32(VectorLow, 0);
#elif defined(GI_SSE2_INTRINSICS)
692
    Vector = GiMinNanFloat32(
693
            Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(2, 3, 2, 3)));
694
    Vector = GiMinNanFloat32(
695 696 697 698 699
            Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(1, 1, 1, 1)));
    return GiExtractLane0Float32(Vector);
#else
    float ret = Vector[0];
    for (size_t i = 1; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
700
        ret = MIN_NAN(ret, Vector[i]);
701 702 703 704 705
    }
    return ret;
#endif
}

706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725
GI_FORCEINLINE
GI_FLOAT32_t GiAbsFloat32(GI_FLOAT32_t Vector1) {
#if defined(GI_NEON64_INTRINSICS)
    return vabsq_f32(Vector1);
#elif defined(GI_SSE2_INTRINSICS)
    union {
        unsigned int int_val;
        float float_val;
    } value;
    value.int_val = 0x7fffffff;
    return _mm_and_ps(Vector1, _mm_set_ps1(value.float_val));
#else
    GI_FLOAT32_t ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
        ret[i] = Vector1[i] > 0 ? Vector1[i] : -Vector1[i];
    }
    return ret;
#endif
}

726
// vim: syntax=cpp.doxygen