/**
 * \file dnn/src/fallback/general_intrinsic/gi_common.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2022 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */

#pragma once

#include "math.h"
#include "stdint.h"
#include "string.h"

//! NOTE(review): the operands of the four angle-bracket includes below were
//! missing from the reviewed copy of this file and have been reconstructed;
//! confirm the header names against the build before merging.
#if defined(_WIN32)
#include <intrin.h>
#include <windows.h>
#else
#if defined(__arm__) || defined(__aarch64__)
#include "src/arm_common/simd_macro/marm_neon.h"
#endif
#if defined(__x86_64__) || defined(__i386__)
#include <cpuid.h>
#include <immintrin.h>
#endif
#endif

//! target architecture detection
#if defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
#define GI_TARGET_X86
#endif

#if defined(__arm__) || defined(__aarch64__)
#define GI_TARGET_ARM
#endif

//! GI stand for general intrinsic
//! GI_DECLSPEC_ALIGN(variable, alignment): declare \p variable with the
//! requested alignment using the toolchain specific spelling
#ifdef _WIN32
#define GI_DECLSPEC_ALIGN(variable, alignment) DECLSPEC_ALIGN(alignment) variable
#else
#define GI_DECLSPEC_ALIGN(variable, alignment) \
    variable __attribute__((aligned(alignment)))
#endif

//! GI_FORCEINLINE: request unconditional inlining of the helpers below
#if defined(_MSC_VER)
#define GI_FORCEINLINE __forceinline
#else
#define GI_FORCEINLINE __attribute__((always_inline)) inline
#endif

//! GI_INTERNAL_DATA: C linkage, additionally hidden visibility when the
//! compiler is not MSVC
#if defined(_MSC_VER)
#define GI_INTERNAL_DATA extern "C"
#else
#define GI_INTERNAL_DATA extern "C" __attribute((visibility("hidden")))
#endif

//! select the instruction set macros used by the rest of this header
#if defined(GI_TARGET_ARM)
#define GI_NEON_INTRINSICS
#if defined(__aarch64__)
#define GI_NEON64_INTRINSICS
#define GI_NEON32_INTRINSICS
#else
#define GI_NEON32_INTRINSICS
#endif
#elif defined(GI_TARGET_X86)
//! AVX/AVX2/FMA detection is intentionally left disabled here, so this header
//! itself never defines GI_AVX_INTRINSICS, GI_AVX2_INTRINSICS or
//! GI_FMA_INTRINSICS
//#if defined(__FMA__)
//#define GI_FMA_INTRINSICS
//#define GI_AVX2_INTRINSICS
//#define GI_AVX_INTRINSICS
//#elif defined(__AVX2__)
//#define GI_AVX2_INTRINSICS
//#define GI_AVX_INTRINSICS
//#elif defined(__AVX__)
//#define GI_AVX_INTRINSICS
#if defined(__SSE4_2__)
#define GI_SSE42_INTRINSICS
#define GI_SSE2_INTRINSICS
#elif defined(__SSE2__)
#define GI_SSE2_INTRINSICS
#endif
#endif

//! vector types, one set per backend; the last branch uses the compiler
//! vector_size extension when no SIMD backend macro is defined
#if defined(GI_AVX_INTRINSICS) || defined(GI_AVX2_INTRINSICS) || \
        defined(GI_FMA_INTRINSICS)
typedef __m256 GI_FLOAT32_t;
typedef __m256i GI_UINT8_t;
typedef __m256i GI_INT8_t;
typedef __m256i GI_INT16_t;
typedef __m256i GI_INT32_t;
typedef __m256i GI_UINT32_t;
#elif defined(GI_NEON_INTRINSICS)
typedef float32x4_t GI_FLOAT32_t;
typedef uint8x16_t GI_UINT8_t;
typedef int8x16_t GI_INT8_t;
typedef int16x8_t GI_INT16_t;
typedef int32x4_t GI_INT32_t;
typedef uint32x4_t GI_UINT32_t;
#elif defined(GI_SSE2_INTRINSICS) || defined(GI_SSE42_INTRINSICS)
typedef __m128 GI_FLOAT32_t;
typedef __m128i GI_UINT8_t;
typedef __m128i GI_INT8_t;
typedef __m128i GI_INT16_t;
typedef __m128i GI_INT32_t;
typedef __m128i GI_UINT32_t;
#else
typedef float GI_FLOAT32_t __attribute__((vector_size(16)));
typedef uint8_t GI_UINT8_t __attribute__((vector_size(16)));
typedef int8_t GI_INT8_t __attribute__((vector_size(16)));
typedef int16_t GI_INT16_t __attribute__((vector_size(16)));
typedef int32_t GI_INT32_t __attribute__((vector_size(16)));
typedef uint32_t GI_UINT32_t __attribute__((vector_size(16)));
#endif

//! general intrinsic support dynamic length simd, if avx or avx2 the simd
//! length is 256
#if defined(GI_AVX_INTRINSICS) || defined(GI_AVX2_INTRINSICS) || \
        defined(GI_FMA_INTRINSICS)
#define GI_SIMD_LEN      256
#define GI_SIMD_LEN_BYTE 32
//! if neon and sse the simd length is 128
#elif defined(GI_NEON_INTRINSICS) || defined(GI_SSE2_INTRINSICS) || \
        defined(GI_SSE42_INTRINSICS)
#define GI_SIMD_LEN      128
#define GI_SIMD_LEN_BYTE 16
#else
//! if no simd hardware support, the simd is implemented by C, default set to
//! 128
#define GI_SIMD_LEN      128
#define GI_SIMD_LEN_BYTE 16
#endif

//! Max(a, b) / Min(a, b): larger / smaller of the two arguments.
//! The whole expansion is parenthesized so the macros can be embedded in a
//! larger expression (e.g. `2 * Max(x, y)`) without the surrounding operators
//! binding to the conditional's operands.
//! The selected argument is evaluated twice, so do not pass expressions with
//! side effects.
#define Max(a, b) ((a) > (b) ? (a) : (b))
#define Min(a, b) ((a) < (b) ? (a) : (b))

//! multiply-accumulate helpers for NEON: mapped to the vfmaq_* intrinsics when
//! __ARM_FEATURE_FMA and GI_NEON64_INTRINSICS are both defined, otherwise to
//! the vmlaq_* intrinsics
#if defined(GI_NEON_INTRINSICS)
#if defined(__ARM_FEATURE_FMA) && defined(GI_NEON64_INTRINSICS)
#define v_fma_ps_f32(c, b, a)         vfmaq_f32((c), (b), (a))
#define v_fma_n_f32(c, b, a)          vfmaq_n_f32((c), (b), (a))
#define v_fma_lane_f32(c, b, a, lane) vfmaq_lane_f32((c), (b), (a), (lane))
#else
#define v_fma_ps_f32(c, b, a)         vmlaq_f32((c), (b), (a))
#define v_fma_n_f32(c, b, a)          vmlaq_n_f32((c), (b), (a))
#define v_fma_lane_f32(c, b, a, lane) vmlaq_lane_f32((c), (b), (a), (lane))
#endif
#endif

//! fixed-size groups of vectors, accessed through the `val` array
typedef struct {
    GI_INT32_t val[2];
} GI_INT32_V2_t;

typedef struct {
    GI_INT32_t val[4];
} GI_INT32_V4_t;

typedef struct {
    GI_FLOAT32_t val[2];
} GI_FLOAT32_V2_t;

typedef struct {
    GI_FLOAT32_t val[4];
} GI_FLOAT32_V4_t;

typedef struct {
    GI_INT16_t val[2];
} GI_INT16_V2_t;

typedef struct {
    GI_INT8_t val[2];
} GI_INT8_V2_t;

//! bitwise AND of two int32 vectors
GI_FORCEINLINE
GI_INT32_t GiAndInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vandq_s32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_and_si128(Vector1, Vector2);
#else
    return Vector1 & Vector2;
#endif
}

//! bitwise OR of two int32 vectors
GI_FORCEINLINE
GI_INT32_t GiOrInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vorrq_s32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_or_si128(Vector1, Vector2);
#else
    return Vector1 | Vector2;
#endif
}

//! bitwise AND of the complement of \p VectorNot with \p Vector,
//! i.e. (~VectorNot) & Vector
GI_FORCEINLINE
GI_INT32_t GiAndNotInt32(GI_INT32_t VectorNot, GI_INT32_t Vector) {
#if defined(GI_NEON_INTRINSICS)
    return vandq_s32(vmvnq_s32(VectorNot), Vector);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_andnot_si128(VectorNot, Vector);
#else
    return (~VectorNot) & Vector;
#endif
}

//! bitwise XOR of two int32 vectors
GI_FORCEINLINE
GI_INT32_t GiXorInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return veorq_s32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_xor_si128(Vector1, Vector2);
#else
    return Vector1 ^ Vector2;
#endif
}

//! return a float32 vector with every lane set to \p Value
GI_FORCEINLINE
GI_FLOAT32_t GiBroadcastFloat32(float Value) {
#if defined(GI_NEON_INTRINSICS)
    return vdupq_n_f32(Value);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_set1_ps(Value);
#else
    GI_FLOAT32_t ret;
    //! one iteration per float lane of the generic vector
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
        ret[i] = Value;
    }
    return ret;
#endif
}

//! return an int32 vector with every lane set to \p Value
GI_FORCEINLINE
GI_INT32_t GiBroadcastInt32(int32_t Value) {
#if defined(GI_NEON_INTRINSICS)
    return vdupq_n_s32(Value);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_set1_epi32(Value);
#else
    GI_INT32_t ret;
    //! one iteration per int32 lane of the generic vector
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) {
        ret[i] = Value;
    }
    return ret;
#endif
}

//! return an int8 vector with every lane set to \p Value
GI_FORCEINLINE
GI_INT8_t GiBroadcastInt8(int8_t Value) {
#if defined(GI_NEON_INTRINSICS)
    return vdupq_n_s8(Value);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_set1_epi8(Value);
#else
    GI_INT8_t ret;
    //! one iteration per int8 lane of the generic vector
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) {
        ret[i] = Value;
    }
    return ret;
#endif
}

//! frequently used broadcast constants; marked unused so translation units
//! that include this header without referencing them do not warn
__attribute__((unused)) const GI_INT8_t vzero_int8 = GiBroadcastInt8(0);
__attribute__((unused)) const GI_INT32_t vzero = GiBroadcastInt32(0);
__attribute__((unused)) const GI_FLOAT32_t vfzero = GiBroadcastFloat32(0.0f);
__attribute__((unused)) const GI_FLOAT32_t vfhalf = GiBroadcastFloat32(0.5f);
__attribute__((unused)) const GI_FLOAT32_t vfneg_half = GiBroadcastFloat32(-0.5f);
__attribute__((unused)) const GI_FLOAT32_t vfmin_int8 = GiBroadcastFloat32(-128.0f);
__attribute__((unused)) const GI_FLOAT32_t vfmax_int8 = GiBroadcastFloat32(127.0f);

// vim: syntax=cpp.doxygen