提交 93c7e451 编写于 作者: M Megvii Engine Team

feat(arm): delete the redundant implementation

GitOrigin-RevId: ff32a3dc8b33c264956c64ccf730d363057ac501
上级 e34a642b
......@@ -39,33 +39,6 @@ typedef __fp16 fp16_fix_t;
template <typename dtype, typename ctype, typename comp_type, bool C1>
struct MeanReducer;
template <>
// Mean reducer for quantized int8, C1 layout (reduce to a single scalar):
// accumulates an int32 running sum of the int8 elements and multiplies by
// 1/cnt in post().
struct MeanReducer<dt_qint8, int8_t, int32_t, true> {
using ctype = int8_t;
// 16 int8 elements consumed per feed() call.
static constexpr int SIMD_WIDTH = 16;
int32_t res;
float coef;
// cnt = number of elements being averaged; coef = 1/cnt.
MeanReducer(DType, size_t cnt) : res(0), coef(1.0 / cnt) {}
MeanReducer() = default;
// Widening-add 16 int8 lanes into the scalar int32 accumulator.
void feed(const int8_t* val) {
#if MEGDNN_AARCH64
// vaddlvq_s8: across-vector widening add, AArch64 only.
res += vaddlvq_s8(vld1q_s8(val));
#elif MEGDNN_ARMV7
// ARMv7 has no across-vector add: pairwise-widen s8->s16->s32,
// then sum the four remaining lanes manually.
auto sum = vpaddlq_s16(vpaddlq_s8(vld1q_s8(val)));
res += (vgetq_lane_s32(sum, 0) + vgetq_lane_s32(sum, 1) +
vgetq_lane_s32(sum, 2) + vgetq_lane_s32(sum, 3));
#else
#error "unsupport android arch"
#endif
}
// Scalar tail: fewer than SIMD_WIDTH elements remaining.
void feed_remain(const int8_t* val) { res += *val; }
void post(int8_t* dst) {
float sum = res * coef;
// NOTE(review): the rounded mean is stored without clamping to
// [-128, 127]; presumably a mean of int8 values always fits -- confirm.
*dst = std::round(sum);
}
};
template <>
struct MeanReducer<dt_quint8, uint8_t, int32_t, true> {
using ctype = uint8_t;
......@@ -97,33 +70,6 @@ struct MeanReducer<dt_quint8, uint8_t, int32_t, true> {
}
};
template <>
// Mean reducer for float32, C1 layout: vector-accumulates 4 floats per
// feed(), adds tail elements into a scalar, and scales by 1/cnt in post().
struct MeanReducer<dt_float32, float, float, true> {
using ctype = float;
static constexpr int SIMD_WIDTH = 4;
float32x4_t res;
float result;
float coef;
MeanReducer(DType, size_t cnt) : result(0.0f), coef(1.0 / cnt) {
res = vdupq_n_f32(0.0f);
}
MeanReducer() = default;
void feed(const float* val) { res = vaddq_f32(vld1q_f32(val), res); }
void feed_remain(const float* val) { result += *val; }
void post(float* dst) {
#if MEGDNN_AARCH64
// vaddvq_f32: horizontal add of all 4 lanes (AArch64 only).
result += vaddvq_f32(res);
#elif MEGDNN_ARMV7
// ARMv7: pairwise-add low/high halves, then sum the two lanes.
auto sum_temp = vpadd_f32(vget_low_f32(res), vget_high_f32(res));
result += (vget_lane_f32(sum_temp, 0) + vget_lane_f32(sum_temp, 1));
#else
#error "unsupport android arch"
#endif
*dst = result * coef;
}
};
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
template <>
struct MeanReducer<__fp16, __fp16, __fp16, true> {
......@@ -170,73 +116,6 @@ struct MeanReducer<__fp16, __fp16, __fp16, false> {
};
#endif
template <>
// Mean reducer for float32, non-C1 layout: keeps per-lane partial sums so
// post() can emit a whole vector of means at once; scalar tail is handled
// by feed_remain()/post_remain().
struct MeanReducer<dt_float32, float, float, false> {
using ctype = float;
static constexpr int SIMD_WIDTH = 4;
float32x4_t res;
float remain;
float coef;
MeanReducer(DType, size_t cnt) : remain(0.0f), coef(1.0 / cnt) {
res = vdupq_n_f32(0.0f);
}
MeanReducer() = default;
void feed(const float* val) { res = vaddq_f32(vld1q_f32(val), res); }
void feed_remain(const float* val) { remain += *val; }
// Scale every lane by 1/cnt and store 4 means in one instruction.
void post(float* dst) {
res = vmulq_n_f32(res, coef);
vst1q_f32(dst, res);
}
void post_remain(float* dst) { *dst = remain * coef; }
};
template <>
// Mean reducer for quantized int8, non-C1 layout: widens each int8x16 load
// into four int32x4 accumulators so per-lane sums cannot overflow, then
// converts back to int8 (via QConverter) in post().
struct MeanReducer<dt_qint8, int8_t, int32_t, false> {
using ctype = int8_t;
static constexpr int SIMD_WIDTH = 16;
int32x4_t res[4];
int32_t remain;
// NOTE(review): cnt is stored but never read after the constructor --
// presumably kept for symmetry with other reducers; confirm before use.
int32_t cnt;
float coef;
float32x4_t vcoef;
MeanReducer(DType, size_t cnt) : remain(0), cnt(cnt), coef(1.0 / cnt) {
memset(res, 0, sizeof(res));
vcoef = vdupq_n_f32(coef);
}
MeanReducer() = default;
// Widen int8x16 -> 2x int16x8 -> 4x int32x4 and accumulate per lane.
void feed(const int8_t* val) {
const int8x16_t vval = vld1q_s8(val);
const int16x8_t vval_low = vmovl_s8(vget_low_s8(vval));
const int16x8_t vval_high = vmovl_s8(vget_high_s8(vval));
const int32x4_t vval_low_low = vmovl_s16(vget_low_s16(vval_low));
const int32x4_t vval_low_high = vmovl_s16(vget_high_s16(vval_low));
const int32x4_t vval_high_low = vmovl_s16(vget_low_s16(vval_high));
const int32x4_t vval_high_high = vmovl_s16(vget_high_s16(vval_high));
res[0] = vaddq_s32(res[0], vval_low_low);
res[1] = vaddq_s32(res[1], vval_low_high);
res[2] = vaddq_s32(res[2], vval_high_low);
res[3] = vaddq_s32(res[3], vval_high_high);
}
void feed_remain(const int8_t* val) { remain += *val; }
// Convert two int32x4 accumulators per iteration: scale to float, then
// narrow 8 results at a time to int8 through QConverter.
void post(int8_t* dst) {
for (int i = 0; i < 4; i += 2) {
float32x4_t vitem0 = vmulq_f32(vcvtq_f32_s32(res[i]), vcoef);
float32x4_t vitem1 = vmulq_f32(vcvtq_f32_s32(res[i + 1]), vcoef);
vst1_s8(dst,
(QConverter::convert<int8x8_t, float32x4x2_t>({{vitem0, vitem1}})));
dst += 8;
}
}
void post_remain(int8_t* dst) {
float sum = remain * coef;
// NOTE(review): unlike post(), the tail bypasses QConverter and is not
// saturated to the int8 range -- confirm the mean cannot overflow.
*dst = std::round(sum);
}
};
template <>
struct MeanReducer<dt_quint8, uint8_t, int32_t, false> {
using ctype = uint8_t;
......@@ -335,8 +214,6 @@ struct minReducer;
} \
}
REDUCER_MAX_MIN_C1(max, dt_qint8, int8_t, int8_t, s, int, -128);
REDUCER_MAX_MIN_C1(min, dt_qint8, int8_t, int8_t, s, int, 127);
REDUCER_MAX_MIN_C1(max, dt_quint8, uint8_t, uint8_t, u, uint, 0);
REDUCER_MAX_MIN_C1(min, dt_quint8, uint8_t, uint8_t, u, uint, 255);
#undef REDUCER_MAX_MIN_C1
......@@ -364,45 +241,10 @@ REDUCER_MAX_MIN_C1(min, dt_quint8, uint8_t, uint8_t, u, uint, 255);
void post_remain(ctype* dst) { vst1q_lane_##_stype(dst, remain, 0); } \
}
REDUCER_MAX_MIN_C(max, dt_qint8, int8_t, int8_t, s8, int, -128);
REDUCER_MAX_MIN_C(min, dt_qint8, int8_t, int8_t, s8, int, 127);
REDUCER_MAX_MIN_C(max, dt_quint8, uint8_t, uint8_t, u8, uint, 0);
REDUCER_MAX_MIN_C(min, dt_quint8, uint8_t, uint8_t, u8, uint, 255);
#undef REDUCER_MAX_MIN_C
// Generates a max/min reducer specialization for the C1 (reduce-to-scalar)
// case. Parameters:
//   _mode      max | min (names the struct and selects the NEON v{max,min} op)
//   _dtype     megdnn dtype tag; _ctype: element C type
//   _comp_type comparison/compute type
//   _stype     NEON intrinsic suffix (e.g. f32); __stype: the vector type
//   _num       SIMD width in elements; _init: identity value for the mode
// post() pairwise-reduces the vector halves, then std::max/std::min picks
// the winner of the last two lanes.
#define REDUCER_MAX_MIN_C1( \
_mode, _dtype, _ctype, _comp_type, _stype, __stype, _num, _init) \
template <> \
struct _mode##Reducer<_dtype, _ctype, _comp_type, true> { \
using ctype = _ctype; \
static constexpr int SIMD_WIDTH = _num; \
__stype res; \
_mode##Reducer(DType, size_t) { res = vdupq_n_##_stype(_init); } \
_mode##Reducer() = default; \
void feed(const ctype* val) { \
__stype vval = vld1q_##_stype(val); \
res = v##_mode##q_##_stype(vval, res); \
} \
void feed_remain(const ctype* val) { \
__stype vval = vdupq_n_##_stype(*val); \
res = v##_mode##q_##_stype(vval, res); \
} \
void post(ctype* dst) { \
auto val = v##_mode##_##_stype( \
vget_low_##_stype(res), vget_high_##_stype(res)); \
using namespace std; \
*dst = _mode({vget_lane_##_stype(val, 0), vget_lane_##_stype(val, 1)}); \
} \
}
// float32 instantiations: the identity is lowest() for max, max() for min.
REDUCER_MAX_MIN_C1(
max, dt_float32, float, float, f32, float32x4_t, 4,
std::numeric_limits<dt_float32>::lowest());
REDUCER_MAX_MIN_C1(
min, dt_float32, float, float, f32, float32x4_t, 4,
std::numeric_limits<dt_float32>::max());
#undef REDUCER_MAX_MIN_C1
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#define REDUCER_MAX_MIN_C1( \
_mode, _dtype, _ctype, _comp_type, _stype, __stype, _num, _init) \
......@@ -440,38 +282,6 @@ REDUCER_MAX_MIN_C1(
#undef REDUCER_MAX_MIN_C1
#endif
// Generates a max/min reducer specialization for the non-C1 (vector-output)
// case: res holds per-lane running max/min and post() stores the whole
// vector; the scalar tail is tracked in `remain`. Parameters are the same
// as REDUCER_MAX_MIN_C1 above (_mode, dtype/ctype, NEON suffix, width,
// identity value).
#define REDUCER_MAX_MIN_C( \
_mode, _dtype, _ctype, _comp_type, _stype, __stype, _num, _init) \
template <> \
struct _mode##Reducer<_dtype, _ctype, _comp_type, false> { \
using ctype = _ctype; \
static constexpr int SIMD_WIDTH = _num; \
__stype res; \
ctype remain; \
_mode##Reducer(DType, size_t) { \
res = vdupq_n_##_stype(_init); \
remain = _init; \
} \
_mode##Reducer() = default; \
void feed(const ctype* val) { \
__stype vval = vld1q_##_stype(val); \
res = v##_mode##q_##_stype(vval, res); \
} \
void feed_remain(const ctype* val) { \
using namespace std; \
remain = _mode(*val, remain); \
} \
void post(ctype* dst) { vst1q_##_stype(dst, res); } \
void post_remain(ctype* dst) { *dst = remain; } \
}
// float32 instantiations: identity is lowest() for max, max() for min.
REDUCER_MAX_MIN_C(
max, dt_float32, float, float, f32, float32x4_t, 4,
std::numeric_limits<dt_float32>::lowest());
REDUCER_MAX_MIN_C(
min, dt_float32, float, float, f32, float32x4_t, 4,
std::numeric_limits<dt_float32>::max());
#undef REDUCER_MAX_MIN_C
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#define REDUCER_MAX_MIN_C( \
_mode, _dtype, _ctype, _comp_type, _stype, __stype, _num, _init) \
......@@ -513,45 +323,6 @@ struct SumReducer;
template <typename dtype, typename ctype, typename comp_type, bool C1>
struct ProductReducer;
// Generates a Sum/Product reducer specialization for the C1 case.
// Extra parameters beyond the max/min macros:
//   _act  NEON op suffix (add | mul) used for the vector accumulate
//   _op   std functor template (plus | multiplies) used for scalars
// post() pairwise-combines the vector halves, folds the last two lanes,
// and finally folds in the scalar `remain` tail.
#define REDUCER_SUM_PRODUCT_C1( \
_mode, _dtype, _ctype, _comp_type, _stype, __stype, _num, _init, _act, _op) \
template <> \
struct _mode##Reducer<_dtype, _ctype, _comp_type, true> { \
using ctype = _ctype; \
static constexpr int SIMD_WIDTH = _num; \
__stype res; \
ctype remain; \
_mode##Reducer(DType, size_t) { \
res = vdupq_n_##_stype(_init); \
remain = _init; \
} \
_mode##Reducer() = default; \
void feed(const ctype* val) { \
__stype vval = vld1q_##_stype(val); \
res = v##_act##q_##_stype(vval, res); \
} \
void feed_remain(const ctype* val) { \
using namespace std; \
auto op = _op<ctype>(); \
remain = op(remain, *val); \
} \
void post(ctype* dst) { \
using namespace std; \
auto val = v##_act##_##_stype( \
vget_low_##_stype(res), vget_high_##_stype(res)); \
auto op = _op<ctype>(); \
*dst = \
op(remain, \
op(vget_lane_##_stype(val, 0), vget_lane_##_stype(val, 1))); \
} \
}
// float32 instantiations: identity 0 for Sum, 1.0f for Product.
REDUCER_SUM_PRODUCT_C1(
Sum, dt_float32, float, float, f32, float32x4_t, 4, 0, add, plus);
REDUCER_SUM_PRODUCT_C1(
Product, dt_float32, float, float, f32, float32x4_t, 4, 1.0f, mul, multiplies);
#undef REDUCER_SUM_PRODUCT_C1
#define REDUCER_SUM_PRODUCT_C( \
_mode, _dtype, _ctype, _comp_type, _stype, __stype, _num, _init, _act, _op) \
template <> \
......@@ -578,9 +349,6 @@ REDUCER_SUM_PRODUCT_C1(
void post_remain(ctype* dst) { *dst = remain; } \
}
REDUCER_SUM_PRODUCT_C(Sum, dt_float32, float, float, f32, float32x4_t, 4, 0, add, plus);
REDUCER_SUM_PRODUCT_C(
Product, dt_float32, float, float, f32, float32x4_t, 4, 1, mul, multiplies);
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
REDUCER_SUM_PRODUCT_C(Sum, __fp16, __fp16, __fp16, f16, float16x8_t, 8, 0, add, plus);
REDUCER_SUM_PRODUCT_C(
......@@ -633,59 +401,6 @@ REDUCER_SUM_PRODUCT_C1(
template <typename dtype, typename ctype, typename comp_type, bool C1>
struct SumSqrReducer;
template <>
// Sum-of-squares reducer for float32, C1 layout: accumulates val*val into a
// vector, squares tail elements into a scalar, and horizontally sums both
// in post().
struct SumSqrReducer<dt_float32, float, float, true> {
using ctype = float;
static constexpr int SIMD_WIDTH = 4;
float32x4_t res;
float result;
SumSqrReducer(DType, size_t cnt) : result(0.0f) {
MEGDNN_MARK_USED_VAR(cnt);
res = vdupq_n_f32(0.0f);
}
SumSqrReducer() = default;
void feed(const float* val) {
float32x4_t vval = vld1q_f32(val);
res = vaddq_f32(vmulq_f32(vval, vval), res);
}
void feed_remain(const float* val) {
float vval = *val;
result += vval * vval;
}
void post(float* dst) {
#if MEGDNN_AARCH64
// vaddvq_f32: horizontal add of all 4 lanes (AArch64 only).
result += vaddvq_f32(res);
#elif MEGDNN_ARMV7
// ARMv7: pairwise-add low/high halves, then sum the two lanes.
auto sum_temp = vpadd_f32(vget_low_f32(res), vget_high_f32(res));
result += (vget_lane_f32(sum_temp, 0) + vget_lane_f32(sum_temp, 1));
#else
#error "unsupport android arch"
#endif
*dst = result;
}
};
template <>
// Sum-of-squares reducer for float32, non-C1 layout: keeps per-lane partial
// sums of squares and stores the whole vector in post(); scalar tail goes
// through feed_remain()/post_remain().
struct SumSqrReducer<dt_float32, float, float, false> {
using ctype = float;
static constexpr int SIMD_WIDTH = 4;
float32x4_t res;
float remain;
SumSqrReducer(DType, size_t cnt) : remain(0.0f) {
MEGDNN_MARK_USED_VAR(cnt);
res = vdupq_n_f32(0.0f);
}
SumSqrReducer() = default;
void feed(const float* val) {
float32x4_t vval = vld1q_f32(val);
res = vaddq_f32(vmulq_f32(vval, vval), res);
}
void feed_remain(const float* val) { remain += (*val) * (*val); }
void post(float* dst) { vst1q_f32(dst, res); }
void post_remain(float* dst) { *dst = remain; }
};
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
template <>
struct SumSqrReducer<__fp16, __fp16, __fp16, true> {
......@@ -873,14 +588,12 @@ void ReduceImpl::exec(
default: \
break; \
}
if (src.layout.is_contiguous() &&
src.layout.dtype.category() == DTypeCategory::QUANTIZED &&
param().data_type == param::Reduce::DataType::DEFAULT) {
DType src_type = src.layout.dtype;
if (src.layout.dtype.enumv() == DTypeEnum::QuantizedS8) {
DISPATCH_MODE_QUANTIZED(dt_qint8, int8_t, int32_t)
}
if (src.layout.dtype.enumv() == DTypeEnum::Quantized8Asymm) {
DISPATCH_MODE_QUANTIZED(dt_quint8, uint8_t, int32_t)
}
......@@ -889,9 +602,7 @@ void ReduceImpl::exec(
src.layout.dtype.category() == DTypeCategory::FLOAT &&
param().data_type == param::Reduce::DataType::DEFAULT) {
DType src_type = src.layout.dtype;
if (src.layout.dtype.enumv() == DTypeEnum::Float32) {
DISPATCH_MODE_FLOAT(dt_float32, float, float)
}
MEGDNN_MARK_USED_VAR(src_type);
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
if (src.layout.dtype.enumv() == DTypeEnum::Float16) {
DNN_INC_FLOAT16(DISPATCH_MODE_FLOAT(__fp16, __fp16, __fp16));
......
......@@ -20,13 +20,19 @@
#else
#if defined(__arm__) || defined(__aarch64__)
#include <arm_neon.h>
#define GI_TARGET_ARM
#endif
#if defined(__x86_64__) || defined(__i386__)
#include <cpuid.h>
#include <immintrin.h>
#endif
#endif
#if defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
#define GI_TARGET_X86
#endif
#if defined(__arm__) || defined(__aarch64__)
#define GI_TARGET_ARM
#endif
#ifdef _WIN32
......
......@@ -454,22 +454,22 @@ GiBlendFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2, GI_FLOAT32 Selection) {
GiAndFloat32(Vector2, Selection), GiAndNotFloat32(Selection, Vector1));
}
#define MIN_NAN(a, b) (isnan(a) || (a) < (b)) ? (a) : (b);
#define MAX_NAN(a, b) (isnan(a) || (a) > (b)) ? (a) : (b);
GI_FORCEINLINE
GI_FLOAT32
GiMaximumFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
#if defined(GI_NEON_INTRINSICS)
return vmaxq_f32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
#else
//! _mm_max_ps does not follow the IEEE standard when input is NAN, so
//! implement by C code
#define MAX_NAN(a, b) (std::isnan(a) || (a) > (b)) ? (a) : (b);
GI_FLOAT32 max;
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
max[i] = MAX_NAN(Vector1[i], Vector2[i]);
}
return max;
#else
return GiBlendFloat32(Vector2, Vector1, Vector1 > Vector2);
#endif
}
......@@ -478,18 +478,14 @@ GI_FLOAT32
GiMinimumFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
#if defined(GI_NEON_INTRINSICS)
return vminq_f32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
return _mm_min_ps(Vector1, Vector2);
#else
//! _mm_min_ps does not follow the IEEE standard when input is NAN, so
//! implement by C code
#define MIN_NAN(a, b) (std::isnan(a) || (a) < (b)) ? (a) : (b);
GI_FLOAT32 min;
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
min[i] = MIN_NAN(Vector1[i], Vector2[i]);
}
return min;
#else
return GiBlendFloat32(Vector2, Vector1, Vector2 > Vector1);
#endif
}
......@@ -563,11 +559,6 @@ float GiReduceMaximumFloat32(GI_FLOAT32 Vector) {
VectorLow = vpmax_f32(VectorLow, VectorHigh);
VectorLow = vpmax_f32(VectorLow, VectorHigh);
return vget_lane_f32(VectorLow, 0);
#elif defined(GI_VSX_INTRINSICS)
Vector = GiMaximumFloat32(
Vector, GI_FLOAT32(vec_splat((__vector long long)Vector, 1)));
Vector = GiMaximumFloat32(Vector, vec_splat(Vector, 1));
return Vector[0];
#elif defined(GI_SSE2_INTRINSICS)
Vector = GiMaximumFloat32(
Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(2, 3, 2, 3)));
......@@ -577,7 +568,7 @@ float GiReduceMaximumFloat32(GI_FLOAT32 Vector) {
#else
float ret = Vector[0];
for (size_t i = 1; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
ret = Max(ret, Vector[i]);
ret = MAX_NAN(ret, Vector[i]);
}
return ret;
#endif
......@@ -602,7 +593,7 @@ float GiReduceMinimumFloat32(GI_FLOAT32 Vector) {
#else
float ret = Vector[0];
for (size_t i = 1; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
ret = Min(ret, Vector[i]);
ret = MIN_NAN(ret, Vector[i]);
}
return ret;
#endif
......
......@@ -416,7 +416,7 @@ GiMoveLowLongInt16(GI_INT16 Vector) {
}
GI_FORCEINLINE
int16_t GiReduceAddInt8(GI_INT8 Vector) {
int32_t GiReduceAddInt8(GI_INT8 Vector) {
#if defined(GI_NEON64_INTRINSICS)
return vaddlvq_s8(Vector);
#elif defined(GI_NEON32_INTRINSICS)
......@@ -467,8 +467,10 @@ int8_t GiReduceMaxInt8(GI_INT8 Vector) {
#elif defined(GI_NEON32_INTRINSICS)
int8x8_t VectorLow = vget_low_s8(Vector);
int8x8_t VectorHigh = vget_high_s8(Vector);
VectorLow = vpmin_s8(VectorLow, VectorHigh);
VectorLow = vpmin_s8(VectorLow, VectorHigh);
VectorLow = vpmax_s8(VectorLow, VectorHigh);
VectorLow = vpmax_s8(VectorLow, VectorLow);
VectorLow = vpmax_s8(VectorLow, VectorLow);
VectorLow = vpmax_s8(VectorLow, VectorLow);
return vget_lane_s8(VectorLow, 0);
#elif defined(GI_SSE42_INTRINSICS)
__m128i v0 = _mm_cvtepi8_epi16(Vector);
......@@ -514,7 +516,9 @@ int8_t GiReduceMinInt8(GI_INT8 Vector) {
int8x8_t VectorLow = vget_low_s8(Vector);
int8x8_t VectorHigh = vget_high_s8(Vector);
VectorLow = vpmin_s8(VectorLow, VectorHigh);
VectorLow = vpmin_s8(VectorLow, VectorHigh);
VectorLow = vpmin_s8(VectorLow, VectorLow);
VectorLow = vpmin_s8(VectorLow, VectorLow);
VectorLow = vpmin_s8(VectorLow, VectorLow);
return vget_lane_s8(VectorLow, 0);
#elif defined(GI_SSE42_INTRINSICS)
__m128i v0 = _mm_cvtepi8_epi16(Vector);
......
......@@ -145,7 +145,7 @@ struct minReducer;
_mode##Reducer() = default; \
void feed(const float* val) { \
auto vval = GiLoadFloat32(val); \
res = Gi##_Mode##imumFloat32(vval, res); \
res = Gi##_Mode##imumFloat32(res, vval); \
} \
void feed_remain(const float* val) { \
auto vval = GiBroadcastFloat32(*val); \
......@@ -172,7 +172,7 @@ REDUCER_MAX_MIN_C1(min, Min, std::numeric_limits<dt_float32>::max());
_mode##Reducer() = default; \
void feed(const float* val) { \
GI_FLOAT32 vval = GiLoadFloat32(val); \
res = Gi##_Mode##imumFloat32(vval, res); \
res = Gi##_Mode##imumFloat32(res, vval); \
} \
void feed_remain(const float* val) { \
using namespace std; \
......@@ -200,7 +200,7 @@ REDUCER_MAX_MIN_C(min, Min, std::numeric_limits<dt_float32>::max());
} \
void feed_remain(const int8_t* val) { \
GI_INT8 vval = GiBroadcastInt8(*val); \
res = Gi##_Mode##imumInt8(vval, res); \
res = Gi##_Mode##imumInt8(res, vval); \
} \
void post(int8_t* dst) { *dst = GiReduce##_Mode##Int8(res); } \
}
......@@ -223,7 +223,7 @@ REDUCER_MAX_MIN_C1(min, Min, 127);
_mode##Reducer() = default; \
void feed(const int8_t* val) { \
GI_INT8 vval = GiLoadInt8(val); \
res = Gi##_Mode##imumInt8(vval, res); \
res = Gi##_Mode##imumInt8(res, vval); \
} \
void feed_remain(const int8_t* val) { \
using namespace std; \
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册