提交 a017bed3 编写于 作者: M Megvii Engine Team

fix(fallback): rename general intrinsic type and add more intrinsics

GitOrigin-RevId: 37409bae9a9a0e8c324f546e2ed985566cca18c7
上级 fd6f8e58
......@@ -82,29 +82,33 @@
//! SIMD vector type aliases. Exactly one branch of the ladder below is
//! compiled, chosen by which GI_*_INTRINSICS macro is defined, so the rest of
//! the code can name vector types without knowing the target instruction set.
//! NOTE(review): every branch shows both un-suffixed names (GI_FLOAT32, ...)
//! and `_t`-suffixed names (GI_FLOAT32_t, ...); this text looks like a
//! rendered diff, so presumably the un-suffixed lines are the removed side --
//! confirm against the repository. GI_UINT32_t exists only in `_t` form.
#if defined(GI_AVX_INTRINSICS) || defined(GI_AVX2_INTRINSICS) || \
defined(GI_FMA_INTRINSICS)
//! AVX / AVX2 / FMA: float lanes use __m256, every integer flavour shares
//! __m256i.
typedef __m256 GI_FLOAT32;
typedef __m256i GI_UINT8;
typedef __m256i GI_INT8;
typedef __m256i GI_INT16;
typedef __m256i GI_INT32;
typedef __m256 GI_FLOAT32_t;
typedef __m256i GI_UINT8_t;
typedef __m256i GI_INT8_t;
typedef __m256i GI_INT16_t;
typedef __m256i GI_INT32_t;
typedef __m256i GI_UINT32_t;
#elif defined(GI_NEON_INTRINSICS)
//! NEON: each alias maps to the distinct, element-typed NEON vector type.
typedef float32x4_t GI_FLOAT32;
typedef uint8x16_t GI_UINT8;
typedef int8x16_t GI_INT8;
typedef int16x8_t GI_INT16;
typedef int32x4_t GI_INT32;
typedef float32x4_t GI_FLOAT32_t;
typedef uint8x16_t GI_UINT8_t;
typedef int8x16_t GI_INT8_t;
typedef int16x8_t GI_INT16_t;
typedef int32x4_t GI_INT32_t;
typedef uint32x4_t GI_UINT32_t;
#elif defined(GI_SSE2_INTRINSICS) || defined(GI_SSE42_INTRINSICS)
//! SSE2 / SSE4.2: float lanes use __m128, every integer flavour shares
//! __m128i.
typedef __m128 GI_FLOAT32;
typedef __m128i GI_UINT8;
typedef __m128i GI_INT8;
typedef __m128i GI_INT16;
typedef __m128i GI_INT32;
typedef __m128 GI_FLOAT32_t;
typedef __m128i GI_UINT8_t;
typedef __m128i GI_INT8_t;
typedef __m128i GI_INT16_t;
typedef __m128i GI_INT32_t;
typedef __m128i GI_UINT32_t;
#else
//! No known intrinsic set: fall back to the compiler's vector_size attribute
//! with 16-byte vectors of the matching element type.
typedef float GI_FLOAT32 __attribute__((vector_size(16)));
typedef uint8_t GI_UINT8 __attribute__((vector_size(16)));
typedef int8_t GI_INT8 __attribute__((vector_size(16)));
typedef int16_t GI_INT16 __attribute__((vector_size(16)));
typedef int32_t GI_INT32 __attribute__((vector_size(16)));
typedef float GI_FLOAT32_t __attribute__((vector_size(16)));
typedef uint8_t GI_UINT8_t __attribute__((vector_size(16)));
typedef int8_t GI_INT8_t __attribute__((vector_size(16)));
typedef int16_t GI_INT16_t __attribute__((vector_size(16)));
typedef int32_t GI_INT32_t __attribute__((vector_size(16)));
typedef uint32_t GI_UINT32_t __attribute__((vector_size(16)));
#endif
//! general intrinsic support dynamic length simd, if avx or avx2 the simd
......@@ -129,24 +133,31 @@ typedef int32_t GI_INT32 __attribute__((vector_size(16)));
//! Smaller of two values. The whole replacement list is parenthesized so the
//! macro can be used as a sub-expression: without the outer parentheses
//! `2 * Min(x, y)` would parse as `(2 * x < y) ? x : y`.
//! Each argument may be evaluated twice, so do not pass expressions with side
//! effects.
#define Min(a, b) ((a) < (b) ? (a) : (b))
//! Aggregate wrappers that bundle several SIMD vectors of one element type
//! into a single value: each struct holds a `val` array of 2 (`_V2`) or
//! 4 (`_V4`) vectors.
//! NOTE(review): both the un-suffixed names (GI_INT32_V2, ...) and the
//! `_t`-suffixed names (GI_INT32_V2_t, ...) appear interleaved below; this
//! looks like a rendered diff in which the un-suffixed lines are presumably
//! the removed side -- confirm against the repository. GI_INT16_V2_t and
//! GI_INT8_V2_t appear only in `_t` form.
typedef struct {
GI_INT32 val[2];
} GI_INT32_V2;
GI_INT32_t val[2];
} GI_INT32_V2_t;
typedef struct {
GI_INT32 val[4];
} GI_INT32_V4;
GI_INT32_t val[4];
} GI_INT32_V4_t;
typedef struct {
GI_FLOAT32 val[2];
} GI_FLOAT32_V2;
GI_FLOAT32_t val[2];
} GI_FLOAT32_V2_t;
typedef struct {
GI_FLOAT32 val[4];
} GI_FLOAT32_V4;
GI_FLOAT32_t val[4];
} GI_FLOAT32_V4_t;
typedef struct {
GI_INT16_t val[2];
} GI_INT16_V2_t;
typedef struct {
GI_INT8_t val[2];
} GI_INT8_V2_t;
GI_FORCEINLINE
GI_INT32
GiAndInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
GI_INT32_t GiAndInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
return vandq_s32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
......@@ -157,8 +168,7 @@ GiAndInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
}
GI_FORCEINLINE
GI_INT32
GiOrInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
GI_INT32_t GiOrInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
return vorrq_s32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
......@@ -169,8 +179,7 @@ GiOrInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
}
GI_FORCEINLINE
GI_INT32
GiAndNotInt32(GI_INT32 VectorNot, GI_INT32 Vector) {
GI_INT32_t GiAndNotInt32(GI_INT32_t VectorNot, GI_INT32_t Vector) {
#if defined(GI_NEON_INTRINSICS)
return vandq_s32(vmvnq_s32(VectorNot), Vector);
#elif defined(GI_SSE2_INTRINSICS)
......@@ -181,8 +190,7 @@ GiAndNotInt32(GI_INT32 VectorNot, GI_INT32 Vector) {
}
GI_FORCEINLINE
GI_INT32
GiXorInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
GI_INT32_t GiXorInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
return veorq_s32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
......
......@@ -46,25 +46,25 @@ struct MeanReducer<dt_qint8, int8_t, int32_t, false> {
using ctype = int8_t;
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t);
GI_INT32 res[4];
GI_INT32_t res[4];
int32_t remain;
int32_t cnt;
float coef;
GI_FLOAT32 vcoef;
GI_FLOAT32_t vcoef;
MeanReducer(DType, size_t cnt) : remain(0), cnt(cnt), coef(1.0 / cnt) {
memset(res, 0, sizeof(res));
vcoef = GiBroadcastFloat32(coef);
}
MeanReducer() = default;
void feed(const int8_t* val) {
const GI_INT8 vval = GiLoadInt8(val);
const GI_INT16 vval_low = GiMoveLowLongInt8(vval);
const GI_INT16 vval_high = GiMoveHighLongInt8(vval);
const GI_INT8_t vval = GiLoadInt8(val);
const GI_INT16_t vval_low = GiMoveLowLongInt8(vval);
const GI_INT16_t vval_high = GiMoveHighLongInt8(vval);
const GI_INT32 vval_low_low = GiMoveLowLongInt16(vval_low);
const GI_INT32 vval_low_high = GiMoveHighLongInt16(vval_low);
const GI_INT32 vval_high_low = GiMoveLowLongInt16(vval_high);
const GI_INT32 vval_high_high = GiMoveHighLongInt16(vval_high);
const GI_INT32_t vval_low_low = GiMoveLowLongInt16(vval_low);
const GI_INT32_t vval_low_high = GiMoveHighLongInt16(vval_low);
const GI_INT32_t vval_high_low = GiMoveLowLongInt16(vval_high);
const GI_INT32_t vval_high_high = GiMoveHighLongInt16(vval_high);
res[0] = GiAddInt32(res[0], vval_low_low);
res[1] = GiAddInt32(res[1], vval_low_high);
......@@ -74,11 +74,11 @@ struct MeanReducer<dt_qint8, int8_t, int32_t, false> {
void feed_remain(const int8_t* val) { remain += *val; }
void post(int8_t* dst) {
for (int i = 0; i < 4; i += 2) {
GI_FLOAT32 vitem0 = GiMultiplyFloat32(GiCastToFloat32(res[i]), vcoef);
GI_FLOAT32 vitem1 = GiMultiplyFloat32(GiCastToFloat32(res[i + 1]), vcoef);
GI_FLOAT32_t vitem0 = GiMultiplyFloat32(GiCastToFloat32(res[i]), vcoef);
GI_FLOAT32_t vitem1 = GiMultiplyFloat32(GiCastToFloat32(res[i + 1]), vcoef);
GiStoreLowInt8(
dst,
(QConverter::convert<GI_INT8, GI_FLOAT32_V2>({{vitem0, vitem1}})));
dst, (QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>(
{{vitem0, vitem1}})));
dst += 8;
}
}
......@@ -93,7 +93,7 @@ struct MeanReducer<dt_float32, float, float, true> {
using ctype = float;
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float);
GI_FLOAT32 res;
GI_FLOAT32_t res;
float result;
float coef;
MeanReducer(DType, size_t cnt) : result(0.0f), coef(1.0 / cnt) {
......@@ -113,7 +113,7 @@ struct MeanReducer<dt_float32, float, float, false> {
using ctype = float;
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float);
GI_FLOAT32 res;
GI_FLOAT32_t res;
float remain;
float coef;
MeanReducer(DType, size_t cnt) : remain(0.0f), coef(1.0 / cnt) {
......@@ -140,30 +140,33 @@ struct minReducer;
struct _mode##Reducer<dt_float32, float, float, true> { \
using ctype = float; \
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \
GI_FLOAT32 res; \
GI_FLOAT32_t res; \
_mode##Reducer(DType, size_t) { res = GiBroadcastFloat32(_init); } \
_mode##Reducer() = default; \
void feed(const float* val) { \
auto vval = GiLoadFloat32(val); \
res = Gi##_Mode##imumFloat32(res, vval); \
res = Gi##_Mode##NanFloat32(res, vval); \
} \
void feed_remain(const float* val) { \
auto vval = GiBroadcastFloat32(*val); \
res = Gi##_Mode##imumFloat32(vval, res); \
res = Gi##_Mode##NanFloat32(vval, res); \
} \
void post(float* dst) { *dst = GiReduce##_Mode##imumFloat32(res); } \
void post(float* dst) { *dst = GiReduce##_Mode##NanFloat32(res); } \
}
REDUCER_MAX_MIN_C1(max, Max, std::numeric_limits<dt_float32>::lowest());
REDUCER_MAX_MIN_C1(min, Min, std::numeric_limits<dt_float32>::max());
#undef REDUCER_MAX_MIN_C1
//! NaN-propagating scalar max/min used for the scalar remainder path.
//! The result is `a` when `a` is NaN or when `a` compares greater (Max_NAN) /
//! less (Min_NAN) than `b`; otherwise it is `b`. A NaN in `b` therefore also
//! comes through, because any comparison against NaN is false.
//! Each macro expands to a fully parenthesized expression with no trailing
//! ';', so it can be nested inside a larger expression; the caller supplies
//! the statement terminator. `isnan` must be visible at the point of use.
//! Arguments may be evaluated more than once.
#define Max_NAN(a, b) ((isnan(a) || (a) > (b)) ? (a) : (b))
#define Min_NAN(a, b) ((isnan(a) || (a) < (b)) ? (a) : (b))
#define REDUCER_MAX_MIN_C(_mode, _Mode, _init) \
template <> \
struct _mode##Reducer<dt_float32, float, float, false> { \
using ctype = float; \
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \
GI_FLOAT32 res; \
GI_FLOAT32_t res; \
float remain; \
_mode##Reducer(DType, size_t) { \
res = GiBroadcastFloat32(_init); \
......@@ -171,12 +174,12 @@ REDUCER_MAX_MIN_C1(min, Min, std::numeric_limits<dt_float32>::max());
} \
_mode##Reducer() = default; \
void feed(const float* val) { \
GI_FLOAT32 vval = GiLoadFloat32(val); \
res = Gi##_Mode##imumFloat32(res, vval); \
GI_FLOAT32_t vval = GiLoadFloat32(val); \
res = Gi##_Mode##NanFloat32(res, vval); \
} \
void feed_remain(const float* val) { \
using namespace std; \
remain = _mode(*val, remain); \
remain = _Mode##_NAN(*val, remain); \
} \
void post(float* dst) { GiStoreFloat32(dst, res); } \
void post_remain(float* dst) { *dst = remain; } \
......@@ -185,21 +188,23 @@ REDUCER_MAX_MIN_C1(min, Min, std::numeric_limits<dt_float32>::max());
REDUCER_MAX_MIN_C(max, Max, std::numeric_limits<dt_float32>::lowest());
REDUCER_MAX_MIN_C(min, Min, std::numeric_limits<dt_float32>::max());
#undef REDUCER_MAX_MIN_C
#undef Max_NAN
#undef Min_NAN
#define REDUCER_MAX_MIN_C1(_mode, _Mode, _init) \
template <> \
struct _mode##Reducer<dt_qint8, int8_t, int8_t, true> { \
using ctype = int8_t; \
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t); \
GI_INT8 res; \
GI_INT8_t res; \
_mode##Reducer(DType, size_t) { res = GiBroadcastInt8(_init); } \
_mode##Reducer() = default; \
void feed(const int8_t* val) { \
GI_INT8 vval = GiLoadInt8(val); \
GI_INT8_t vval = GiLoadInt8(val); \
res = Gi##_Mode##imumInt8(vval, res); \
} \
void feed_remain(const int8_t* val) { \
GI_INT8 vval = GiBroadcastInt8(*val); \
GI_INT8_t vval = GiBroadcastInt8(*val); \
res = Gi##_Mode##imumInt8(res, vval); \
} \
void post(int8_t* dst) { *dst = GiReduce##_Mode##Int8(res); } \
......@@ -214,7 +219,7 @@ REDUCER_MAX_MIN_C1(min, Min, 127);
struct _mode##Reducer<dt_qint8, int8_t, int8_t, false> { \
using ctype = int8_t; \
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t); \
GI_INT8 res; \
GI_INT8_t res; \
int8_t remain; \
_mode##Reducer(DType, size_t) { \
res = GiBroadcastInt8(_init); \
......@@ -222,7 +227,7 @@ REDUCER_MAX_MIN_C1(min, Min, 127);
} \
_mode##Reducer() = default; \
void feed(const int8_t* val) { \
GI_INT8 vval = GiLoadInt8(val); \
GI_INT8_t vval = GiLoadInt8(val); \
res = Gi##_Mode##imumInt8(res, vval); \
} \
void feed_remain(const int8_t* val) { \
......@@ -248,7 +253,7 @@ struct ProductReducer;
struct _mode##Reducer<dt_float32, float, float, true> { \
using ctype = float; \
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \
GI_FLOAT32 res; \
GI_FLOAT32_t res; \
float remain; \
_mode##Reducer(DType, size_t) { \
res = GiBroadcastFloat32(_init); \
......@@ -256,7 +261,7 @@ struct ProductReducer;
} \
_mode##Reducer() = default; \
void feed(const float* val) { \
GI_FLOAT32 vval = GiLoadFloat32(val); \
GI_FLOAT32_t vval = GiLoadFloat32(val); \
res = Gi##_Mode##Float32(vval, res); \
} \
void feed_remain(const float* val) { \
......@@ -280,7 +285,7 @@ REDUCER_SUM_PRODUCT_C1(Product, Multiply, multiplies, 1.0f);
struct _mode##Reducer<dt_float32, float, float, false> { \
using ctype = float; \
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \
GI_FLOAT32 res; \
GI_FLOAT32_t res; \
float remain; \
_mode##Reducer(DType, size_t) { \
res = GiBroadcastFloat32(_init); \
......@@ -288,7 +293,7 @@ REDUCER_SUM_PRODUCT_C1(Product, Multiply, multiplies, 1.0f);
} \
_mode##Reducer() = default; \
void feed(const float* val) { \
GI_FLOAT32 vval = GiLoadFloat32(val); \
GI_FLOAT32_t vval = GiLoadFloat32(val); \
res = Gi##_Mode##Float32(vval, res); \
} \
void feed_remain(const float* val) { \
......@@ -313,7 +318,7 @@ struct SumSqrReducer<dt_float32, float, float, true> {
using ctype = float;
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float);
GI_FLOAT32 res;
GI_FLOAT32_t res;
float result;
SumSqrReducer(DType, size_t cnt) : result(0.0f) {
MEGDNN_MARK_USED_VAR(cnt);
......@@ -321,7 +326,7 @@ struct SumSqrReducer<dt_float32, float, float, true> {
}
SumSqrReducer() = default;
void feed(const float* val) {
GI_FLOAT32 vval = GiLoadFloat32(val);
GI_FLOAT32_t vval = GiLoadFloat32(val);
res = GiAddFloat32(GiMultiplyFloat32(vval, vval), res);
}
void feed_remain(const float* val) {
......@@ -338,7 +343,7 @@ struct SumSqrReducer<dt_float32, float, float, false> {
using ctype = float;
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float);
GI_FLOAT32 res;
GI_FLOAT32_t res;
float remain;
SumSqrReducer(DType, size_t cnt) : remain(0.0f) {
MEGDNN_MARK_USED_VAR(cnt);
......@@ -346,7 +351,7 @@ struct SumSqrReducer<dt_float32, float, float, false> {
}
SumSqrReducer() = default;
void feed(const float* val) {
GI_FLOAT32 vval = GiLoadFloat32(val);
GI_FLOAT32_t vval = GiLoadFloat32(val);
res = GiAddFloat32(GiMultiplyFloat32(vval, vval), res);
}
void feed_remain(const float* val) { remain += (*val) * (*val); }
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册