Commit f12b75c0 authored by: Megvii Engine Team

perf(dnn/fallback): optimize some corner cases in reduce

GitOrigin-RevId: 1185594301dfb58d07b8b92212a8c7881fb204de
Parent 35cf0422
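The key change: when the reduced axis is innermost (C == 1) and its length B is 2, 3, or 4, the kernel now loads B interleaved vectors with a single de-interleaving load (vld2q/vld3q/vld4q on NEON, GiLoadUzip*V{2,3,4} in the fallback GI layer) and feeds each component vector to the reducer, producing SIMD_WIDTH outputs per iteration instead of falling back to per-element accumulation. Below is a minimal sketch of that idea for float32 and B == 2, assuming ARM NEON is available; the function name and the plain sum reduction are illustrative, not the library's API.

#include <arm_neon.h>
#include <cstddef>

// Sum-reduce a row-major [A, 2] float32 tensor along its last axis.
void reduce_sum_last_axis_b2(const float* src, float* dst, size_t A) {
    size_t a = 0;
    for (; a + 4 <= A; a += 4) {
        // vld2q_f32 de-interleaves 8 consecutive floats into two vectors:
        // val[0] holds column 0 of four rows, val[1] holds column 1.
        float32x4x2_t v = vld2q_f32(src + a * 2);
        vst1q_f32(dst + a, vaddq_f32(v.val[0], v.val[1]));
    }
    for (; a < A; ++a)  // scalar tail for the remaining rows
        dst[a] = src[a * 2] + src[a * 2 + 1];
}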
......@@ -97,6 +97,7 @@ struct MeanReducer<__fp16, __fp16, __fp16, false> {
}
MeanReducer() = default;
void feed(const ctype* val) { res = vaddq_f16(vld1q_f16(val), res); }
void feed_vector(const float16x8_t& vval) { res = vaddq_f16(vval, res); }
void feed_remain(const ctype* val) { remain += *val; }
void post(ctype* dst) {
res = vmulq_n_f16(res, coef);
......@@ -127,8 +128,8 @@ struct MeanReducer<dt_quint8, uint8_t, int32_t, false> {
vcoef = vdupq_n_f32(coef);
}
MeanReducer() = default;
void feed(const uint8_t* val) {
const uint8x16_t vval = vld1q_u8(val);
void feed(const uint8_t* val) { feed_vector(vld1q_u8(val)); }
void feed_vector(const uint8x16_t& vval) {
const uint16x8_t vval_low = vmovl_u8(vget_low_u8(vval));
const uint16x8_t vval_high = vmovl_u8(vget_high_u8(vval));
......@@ -219,8 +220,8 @@ REDUCER_MAX_MIN_C1(min, dt_quint8, uint8_t, uint8_t, u, uint, 255);
remain = vdupq_n_##_stype(_init); \
} \
_mode##Reducer() = default; \
void feed(const ctype* val) { \
__stype##8x16_t vval = vld1q_##_stype(val); \
void feed(const ctype* val) { feed_vector(vld1q_##_stype(val)); } \
void inline feed_vector(const __stype##8x16_t & vval) { \
res = v##_mode##q_##_stype(vval, res); \
} \
void feed_remain(const ctype* val) { \
......@@ -286,8 +287,8 @@ REDUCER_MAX_MIN_C1(
remain = _init; \
} \
_mode##Reducer() = default; \
void feed(const ctype* val) { \
__stype vval = vld1q_##_stype(val); \
void feed(const ctype* val) { feed_vector(vld1q_##_stype(val)); } \
void inline feed_vector(const __stype& vval) { \
res = v##_mode##q_##_stype(vval, res); \
} \
void feed_remain(const ctype* val) { \
......@@ -326,8 +327,8 @@ struct ProductReducer;
remain = _init; \
} \
_mode##Reducer() = default; \
void feed(const ctype* val) { \
__stype vval = vld1q_##_stype(val); \
void feed(const ctype* val) { feed_vector(vld1q_##_stype(val)); } \
void feed_vector(const __stype& vval) { \
res = v##_act##q_##_stype(vval, res); \
} \
void feed_remain(const ctype* val) { \
......@@ -435,8 +436,8 @@ struct SumSqrReducer<__fp16, __fp16, __fp16, false> {
fp16_fix_t remain;
SumSqrReducer(DType, size_t cnt) : remain(0.0f) { res = vdupq_n_f16(0.0f); }
SumSqrReducer() = default;
void feed(const __fp16* val) {
float16x8_t vval = vld1q_f16(val);
void feed(const __fp16* val) { feed_vector(vld1q_f16(val)); }
void inline feed_vector(const float16x8_t& vval) {
res = vaddq_f16(vmulq_f16(vval, vval), res);
}
void feed_remain(const __fp16* val) { remain += (*val) * (*val); }
......@@ -504,6 +505,60 @@ struct Exec<Reducer, false> {
}
};
template <typename Reducer, typename dtype, size_t B>
struct ExecC1SmallB {
static void do_reduce(
const dtype* src, dtype* dst, DType src_dtype, size_t A, size_t, size_t C);
};
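// For C == 1 and a small reduce length B in {2, 3, 4}: load B interleaved
// vectors at once with vld2q/vld3q/vld4q, feed each de-interleaved component
// to the reducer, and emit SIMD_WIDTH results per iteration; leftover rows
// go through the scalar feed_remain path.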
#define ImplementC1SmallB(_ctype, _simd_prefix, _simd_suffix) \
template <typename Reducer, size_t B> \
struct ExecC1SmallB<Reducer, _ctype, B> { \
static void do_reduce( \
const _ctype* src, _ctype* dst, DType src_dtype, size_t A, size_t, \
size_t) { \
size_t a = 0; \
for (; a + Reducer::SIMD_WIDTH < A; a += Reducer::SIMD_WIDTH) { \
Reducer reducer(src_dtype, B); \
auto src_ptr = src + a * B; \
if (B == 4) { \
_simd_prefix##x4_t data_v4 = vld4q_##_simd_suffix(src_ptr); \
reducer.feed_vector(data_v4.val[0]); \
reducer.feed_vector(data_v4.val[1]); \
reducer.feed_vector(data_v4.val[2]); \
reducer.feed_vector(data_v4.val[3]); \
} \
if (B == 3) { \
_simd_prefix##x3_t data_v3 = vld3q_##_simd_suffix(src_ptr); \
reducer.feed_vector(data_v3.val[0]); \
reducer.feed_vector(data_v3.val[1]); \
reducer.feed_vector(data_v3.val[2]); \
} \
if (B == 2) { \
_simd_prefix##x2_t data_v2 = vld2q_##_simd_suffix(src_ptr); \
reducer.feed_vector(data_v2.val[0]); \
reducer.feed_vector(data_v2.val[1]); \
} \
reducer.post(dst); \
dst += Reducer::SIMD_WIDTH; \
} \
for (; a < A; a++) { \
Reducer reducer(src_dtype, B); \
auto src_ptr = src + a * B; \
for (size_t i = 0; i < B; i++) \
reducer.feed_remain(src_ptr + i); \
reducer.post_remain(dst); \
dst++; \
} \
} \
}
ImplementC1SmallB(uint8_t, uint8x16, u8);
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
ImplementC1SmallB(__fp16, float16x8, f16);
#endif
} // anonymous namespace
void ReduceImpl::exec(
......@@ -513,31 +568,40 @@ void ReduceImpl::exec(
reduce::get_ABC(src.layout, A, B, C, param().axis);
bool execed = false;
using Mode = param::Reduce::Mode;
#define DISPATCH_FUNC(Reducer, dtype, ctype, comp_type) \
if (C == 1) { \
using _Reducer = Reducer<dtype, ctype, comp_type, true>; \
std::function<void(const ctype*, ctype*, DType, size_t, size_t, size_t)> \
do_reduce = Exec<_Reducer, true>::do_reduce; \
MIDOUT_BEGIN( \
megdnn_arm_common_reduce, ctype, dtype, comp_type, midout_iv(1)) { \
MEGDNN_DISPATCH_CPU_KERN_OPR(do_reduce( \
reinterpret_cast<ctype*>(src.raw_ptr()), \
reinterpret_cast<ctype*>(dst.raw_ptr()), src_type, A, B, C)); \
execed = true; \
} \
MIDOUT_END(); \
} else { \
using _Reducer = Reducer<dtype, ctype, comp_type, false>; \
std::function<void(const ctype*, ctype*, DType, size_t, size_t, size_t)> \
do_reduce = Exec<_Reducer, false>::do_reduce; \
MIDOUT_BEGIN( \
megdnn_arm_common_reduce, ctype, dtype, comp_type, midout_iv(1)) { \
MEGDNN_DISPATCH_CPU_KERN_OPR(do_reduce( \
reinterpret_cast<ctype*>(src.raw_ptr()), \
reinterpret_cast<ctype*>(dst.raw_ptr()), src_type, A, B, C)); \
execed = true; \
} \
MIDOUT_END(); \
#define DISPATCH_FUNC(Reducer, _dtype, ctype, comp_type) \
if (C == 1) { \
using _Reducer = Reducer<_dtype, ctype, comp_type, true>; \
using _ReducerC1SmallB = Reducer<_dtype, ctype, comp_type, false>; \
std::function<void(const ctype*, ctype*, DType, size_t, size_t, size_t)> \
do_reduce = Exec<_Reducer, true>::do_reduce; \
if (src.layout.dtype.category() != DTypeCategory::FLOAT) { \
if (B == 2) \
do_reduce = ExecC1SmallB<_ReducerC1SmallB, ctype, 2>::do_reduce; \
if (B == 3) \
do_reduce = ExecC1SmallB<_ReducerC1SmallB, ctype, 3>::do_reduce; \
if (B == 4) \
do_reduce = ExecC1SmallB<_ReducerC1SmallB, ctype, 4>::do_reduce; \
} \
MIDOUT_BEGIN( \
megdnn_arm_common_reduce, ctype, _dtype, comp_type, midout_iv(1)) { \
MEGDNN_DISPATCH_CPU_KERN_OPR(do_reduce( \
reinterpret_cast<ctype*>(src.raw_ptr()), \
reinterpret_cast<ctype*>(dst.raw_ptr()), src_type, A, B, C)); \
execed = true; \
} \
MIDOUT_END(); \
} else { \
using _Reducer = Reducer<_dtype, ctype, comp_type, false>; \
std::function<void(const ctype*, ctype*, DType, size_t, size_t, size_t)> \
do_reduce = Exec<_Reducer, false>::do_reduce; \
MIDOUT_BEGIN( \
megdnn_arm_common_reduce, ctype, _dtype, comp_type, midout_iv(1)) { \
MEGDNN_DISPATCH_CPU_KERN_OPR(do_reduce( \
reinterpret_cast<ctype*>(src.raw_ptr()), \
reinterpret_cast<ctype*>(dst.raw_ptr()), src_type, A, B, C)); \
execed = true; \
} \
MIDOUT_END(); \
}
#define DISPATCH_MODE_QUANTIZED(dtype, ctype, comp_type) \
......
......@@ -36,7 +36,7 @@ void conv_stride2::do_conv_2x2_stride2(
rep(i, nn) {
GI_FLOAT32_t _outp = GiLoadFloat32(outptr);
GI_FLOAT32_V2_t _r0 = GiLd2qFloat32(r0);
GI_FLOAT32_V2_t _r0 = GiLoadUzipFloat32V2(r0);
GI_FLOAT32_t _r00 = GiGetSubVectorFloat32V2(_r0, 0); // 0 2 4 6
GI_FLOAT32_t _r01 = GiGetSubVectorFloat32V2(_r0, 1); // 1 3 5 7
......@@ -44,7 +44,7 @@ void conv_stride2::do_conv_2x2_stride2(
_outp = GiSimdFmaLane(_outp, _r00, _k0123, 0);
_outp = GiSimdFmaLane(_outp, _r01, _k0123, 1);
GI_FLOAT32_V2_t _r1 = GiLd2qFloat32(r1);
GI_FLOAT32_V2_t _r1 = GiLoadUzipFloat32V2(r1);
GI_FLOAT32_t _r10 = GiGetSubVectorFloat32V2(_r1, 0);
GI_FLOAT32_t _r11 = GiGetSubVectorFloat32V2(_r1, 1);
......@@ -94,8 +94,8 @@ void conv_stride2::do_conv_3x3_stride2(
rep(i, nn) {
GI_FLOAT32_t _outp = GiLoadFloat32(outptr);
GI_FLOAT32_V2_t _r0 = GiLd2qFloat32(r0);
GI_FLOAT32_V2_t _r0n = GiLd2qFloat32(r0 + 8);
GI_FLOAT32_V2_t _r0 = GiLoadUzipFloat32V2(r0);
GI_FLOAT32_V2_t _r0n = GiLoadUzipFloat32V2(r0 + 8);
GI_FLOAT32_t _r00 = GiGetSubVectorFloat32V2(_r0, 0); // 0 2 4 6
GI_FLOAT32_t _r01 = GiGetSubVectorFloat32V2(_r0, 1); // 1 3 5 7
......@@ -106,8 +106,8 @@ void conv_stride2::do_conv_3x3_stride2(
_outp = GiSimdFmaLane(_outp, _r01, _k0123, 1);
_outp = GiSimdFmaLane(_outp, _r02, _k0123, 2);
GI_FLOAT32_V2_t _r1 = GiLd2qFloat32(r1);
GI_FLOAT32_V2_t _r1n = GiLd2qFloat32(r1 + 8);
GI_FLOAT32_V2_t _r1 = GiLoadUzipFloat32V2(r1);
GI_FLOAT32_V2_t _r1n = GiLoadUzipFloat32V2(r1 + 8);
GI_FLOAT32_t _r10 = GiGetSubVectorFloat32V2(_r1, 0);
GI_FLOAT32_t _r11 = GiGetSubVectorFloat32V2(_r1, 1);
......@@ -118,8 +118,8 @@ void conv_stride2::do_conv_3x3_stride2(
_outp = GiSimdFmaLane(_outp, _r11, _k3456, 1);
_outp = GiSimdFmaLane(_outp, _r12, _k3456, 2);
GI_FLOAT32_V2_t _r2 = GiLd2qFloat32(r2);
GI_FLOAT32_V2_t _r2n = GiLd2qFloat32(r2 + 8);
GI_FLOAT32_V2_t _r2 = GiLoadUzipFloat32V2(r2);
GI_FLOAT32_V2_t _r2n = GiLoadUzipFloat32V2(r2 + 8);
GI_FLOAT32_t _r20 = GiGetSubVectorFloat32V2(_r2, 0);
GI_FLOAT32_t _r21 = GiGetSubVectorFloat32V2(_r2, 1);
......@@ -176,8 +176,8 @@ void conv_stride2::do_conv_5x5_stride2(
rep(i, nn) {
GI_FLOAT32_t _sum = GiLoadFloat32(outptr);
GI_FLOAT32_V2_t _r00_02461357 = GiLd2qFloat32(r0);
GI_FLOAT32_V2_t _r00nx2 = GiLd2qFloat32(r0 + 8);
GI_FLOAT32_V2_t _r00_02461357 = GiLoadUzipFloat32V2(r0);
GI_FLOAT32_V2_t _r00nx2 = GiLoadUzipFloat32V2(r0 + 8);
GI_FLOAT32_t _r0_8101214 =
GiGetSubVectorFloat32V2(_r00nx2, 0); // 8 10 12 14
GI_FLOAT32_t _r0_9111315 =
......@@ -190,8 +190,8 @@ void conv_stride2::do_conv_5x5_stride2(
GI_FLOAT32_t _r03 = GiExtqFloat32(_r01, _r0_9111315, 1); // 3 5 7 9
GI_FLOAT32_t _r04 = GiExtqFloat32(_r00, _r0_8101214, 2); // 4 6 8 10
GI_FLOAT32_V2_t _r10_02461357 = GiLd2qFloat32(r1);
GI_FLOAT32_V2_t _r10nx2 = GiLd2qFloat32(r1 + 8);
GI_FLOAT32_V2_t _r10_02461357 = GiLoadUzipFloat32V2(r1);
GI_FLOAT32_V2_t _r10nx2 = GiLoadUzipFloat32V2(r1 + 8);
GI_FLOAT32_t _r1_8101214 = GiGetSubVectorFloat32V2(_r10nx2, 0);
GI_FLOAT32_t _r1_9111315 = GiGetSubVectorFloat32V2(_r10nx2, 1);
GI_FLOAT32_t _r10 = GiGetSubVectorFloat32V2(_r10_02461357, 0);
......@@ -200,8 +200,8 @@ void conv_stride2::do_conv_5x5_stride2(
GI_FLOAT32_t _r13 = GiExtqFloat32(_r11, _r1_9111315, 1);
GI_FLOAT32_t _r14 = GiExtqFloat32(_r10, _r1_8101214, 2);
GI_FLOAT32_V2_t _r20_02461357 = GiLd2qFloat32(r2);
GI_FLOAT32_V2_t _r20nx2 = GiLd2qFloat32(r2 + 8);
GI_FLOAT32_V2_t _r20_02461357 = GiLoadUzipFloat32V2(r2);
GI_FLOAT32_V2_t _r20nx2 = GiLoadUzipFloat32V2(r2 + 8);
GI_FLOAT32_t _r2_8101214 = GiGetSubVectorFloat32V2(_r20nx2, 0);
GI_FLOAT32_t _r2_9111315 = GiGetSubVectorFloat32V2(_r20nx2, 1);
GI_FLOAT32_t _r20 = GiGetSubVectorFloat32V2(_r20_02461357, 0);
......@@ -210,8 +210,8 @@ void conv_stride2::do_conv_5x5_stride2(
GI_FLOAT32_t _r23 = GiExtqFloat32(_r21, _r2_9111315, 1);
GI_FLOAT32_t _r24 = GiExtqFloat32(_r20, _r2_8101214, 2);
GI_FLOAT32_V2_t _r30_02461357 = GiLd2qFloat32(r3);
GI_FLOAT32_V2_t _r30nx2 = GiLd2qFloat32(r3 + 8);
GI_FLOAT32_V2_t _r30_02461357 = GiLoadUzipFloat32V2(r3);
GI_FLOAT32_V2_t _r30nx2 = GiLoadUzipFloat32V2(r3 + 8);
GI_FLOAT32_t _r3_8101214 = GiGetSubVectorFloat32V2(_r30nx2, 0);
GI_FLOAT32_t _r3_9111315 = GiGetSubVectorFloat32V2(_r30nx2, 1);
GI_FLOAT32_t _r30 = GiGetSubVectorFloat32V2(_r30_02461357, 0);
......@@ -220,8 +220,8 @@ void conv_stride2::do_conv_5x5_stride2(
GI_FLOAT32_t _r33 = GiExtqFloat32(_r31, _r3_9111315, 1);
GI_FLOAT32_t _r34 = GiExtqFloat32(_r30, _r3_8101214, 2);
GI_FLOAT32_V2_t _r40_02461357 = GiLd2qFloat32(r4);
GI_FLOAT32_V2_t _r40nx2 = GiLd2qFloat32(r4 + 8);
GI_FLOAT32_V2_t _r40_02461357 = GiLoadUzipFloat32V2(r4);
GI_FLOAT32_V2_t _r40nx2 = GiLoadUzipFloat32V2(r4 + 8);
GI_FLOAT32_t _r4_8101214 = GiGetSubVectorFloat32V2(_r40nx2, 0);
GI_FLOAT32_t _r4_9111315 = GiGetSubVectorFloat32V2(_r40nx2, 1);
GI_FLOAT32_t _r40 = GiGetSubVectorFloat32V2(_r40_02461357, 0);
......@@ -315,8 +315,8 @@ void conv_stride2::do_conv_7x7_stride2(
GI_FLOAT32_t _k0123 = GiLoadFloat32(k0);
GI_FLOAT32_t _k4567 = GiLoadFloat32(k0 + 4);
GI_FLOAT32_V2_t _r00_02461357 = GiLd2qFloat32(r0);
GI_FLOAT32_V2_t _r00nx2 = GiLd2qFloat32(r0 + 8);
GI_FLOAT32_V2_t _r00_02461357 = GiLoadUzipFloat32V2(r0);
GI_FLOAT32_V2_t _r00nx2 = GiLoadUzipFloat32V2(r0 + 8);
GI_FLOAT32_t _r0_8101214 =
GiGetSubVectorFloat32V2(_r00nx2, 0); // 8 10 12 14
GI_FLOAT32_t _r0_9111315 =
......@@ -342,8 +342,8 @@ void conv_stride2::do_conv_7x7_stride2(
GI_FLOAT32_t _k78910 = GiLoadFloat32(k1);
GI_FLOAT32_t _k11121314 = GiLoadFloat32(k1 + 4);
GI_FLOAT32_V2_t _r10_02461357 = GiLd2qFloat32(r1);
GI_FLOAT32_V2_t _r10nx2 = GiLd2qFloat32(r1 + 8);
GI_FLOAT32_V2_t _r10_02461357 = GiLoadUzipFloat32V2(r1);
GI_FLOAT32_V2_t _r10nx2 = GiLoadUzipFloat32V2(r1 + 8);
GI_FLOAT32_t _r1_8101214 = GiGetSubVectorFloat32V2(_r10nx2, 0);
GI_FLOAT32_t _r1_9111315 = GiGetSubVectorFloat32V2(_r10nx2, 1);
GI_FLOAT32_t _r10 = GiGetSubVectorFloat32V2(_r10_02461357, 0);
......@@ -365,8 +365,8 @@ void conv_stride2::do_conv_7x7_stride2(
GI_FLOAT32_t _k14151617 = GiLoadFloat32(k2);
GI_FLOAT32_t _k18192021 = GiLoadFloat32(k2 + 4);
GI_FLOAT32_V2_t _r20_02461357 = GiLd2qFloat32(r2);
GI_FLOAT32_V2_t _r20nx2 = GiLd2qFloat32(r2 + 8);
GI_FLOAT32_V2_t _r20_02461357 = GiLoadUzipFloat32V2(r2);
GI_FLOAT32_V2_t _r20nx2 = GiLoadUzipFloat32V2(r2 + 8);
GI_FLOAT32_t _r2_8101214 = GiGetSubVectorFloat32V2(_r20nx2, 0);
GI_FLOAT32_t _r2_9111315 = GiGetSubVectorFloat32V2(_r20nx2, 1);
GI_FLOAT32_t _r20 = GiGetSubVectorFloat32V2(_r20_02461357, 0);
......@@ -388,8 +388,8 @@ void conv_stride2::do_conv_7x7_stride2(
GI_FLOAT32_t _k21222324 = GiLoadFloat32(k3);
GI_FLOAT32_t _k25262728 = GiLoadFloat32(k3 + 4);
GI_FLOAT32_V2_t _r30_02461357 = GiLd2qFloat32(r3);
GI_FLOAT32_V2_t _r30nx2 = GiLd2qFloat32(r3 + 8);
GI_FLOAT32_V2_t _r30_02461357 = GiLoadUzipFloat32V2(r3);
GI_FLOAT32_V2_t _r30nx2 = GiLoadUzipFloat32V2(r3 + 8);
GI_FLOAT32_t _r3_8101214 = GiGetSubVectorFloat32V2(_r30nx2, 0);
GI_FLOAT32_t _r3_9111315 = GiGetSubVectorFloat32V2(_r30nx2, 1);
GI_FLOAT32_t _r30 = GiGetSubVectorFloat32V2(_r30_02461357, 0);
......@@ -411,8 +411,8 @@ void conv_stride2::do_conv_7x7_stride2(
GI_FLOAT32_t _k28293031 = GiLoadFloat32(k4);
GI_FLOAT32_t _k32333435 = GiLoadFloat32(k4 + 4);
GI_FLOAT32_V2_t _r40_02461357 = GiLd2qFloat32(r4);
GI_FLOAT32_V2_t _r40nx2 = GiLd2qFloat32(r4 + 8);
GI_FLOAT32_V2_t _r40_02461357 = GiLoadUzipFloat32V2(r4);
GI_FLOAT32_V2_t _r40nx2 = GiLoadUzipFloat32V2(r4 + 8);
GI_FLOAT32_t _r4_8101214 = GiGetSubVectorFloat32V2(_r40nx2, 0);
GI_FLOAT32_t _r4_9111315 = GiGetSubVectorFloat32V2(_r40nx2, 1);
GI_FLOAT32_t _r40 = GiGetSubVectorFloat32V2(_r40_02461357, 0);
......@@ -434,8 +434,8 @@ void conv_stride2::do_conv_7x7_stride2(
GI_FLOAT32_t _k35363738 = GiLoadFloat32(k5);
GI_FLOAT32_t _k39404142 = GiLoadFloat32(k5 + 4);
GI_FLOAT32_V2_t _r50_02461357 = GiLd2qFloat32(r5);
GI_FLOAT32_V2_t _r50nx2 = GiLd2qFloat32(r5 + 8);
GI_FLOAT32_V2_t _r50_02461357 = GiLoadUzipFloat32V2(r5);
GI_FLOAT32_V2_t _r50nx2 = GiLoadUzipFloat32V2(r5 + 8);
GI_FLOAT32_t _r5_8101214 = GiGetSubVectorFloat32V2(_r50nx2, 0);
GI_FLOAT32_t _r5_9111315 = GiGetSubVectorFloat32V2(_r50nx2, 1);
GI_FLOAT32_t _r50 = GiGetSubVectorFloat32V2(_r50_02461357, 0);
......@@ -457,8 +457,8 @@ void conv_stride2::do_conv_7x7_stride2(
GI_FLOAT32_t _k42434445 = GiLoadFloat32(k6);
GI_FLOAT32_t _k45464748 = GiLoadFloat32(k6 + 3);
GI_FLOAT32_V2_t _r60_02461357 = GiLd2qFloat32(r6);
GI_FLOAT32_V2_t _r60nx2 = GiLd2qFloat32(r6 + 8);
GI_FLOAT32_V2_t _r60_02461357 = GiLoadUzipFloat32V2(r6);
GI_FLOAT32_V2_t _r60nx2 = GiLoadUzipFloat32V2(r6 + 8);
GI_FLOAT32_t _r6_8101214 = GiGetSubVectorFloat32V2(_r60nx2, 0);
GI_FLOAT32_t _r6_9111315 = GiGetSubVectorFloat32V2(_r60nx2, 1);
GI_FLOAT32_t _r60 = GiGetSubVectorFloat32V2(_r60_02461357, 0);
......
......@@ -151,6 +151,8 @@ typedef int32x4x2_t GI_INT32_V2_t;
typedef int32x4x4_t GI_INT32_V4_t;
typedef int16x8x2_t GI_INT16_V2_t;
typedef int8x16x2_t GI_INT8_V2_t;
typedef int8x16x3_t GI_INT8_V3_t;
typedef int8x16x4_t GI_INT8_V4_t;
typedef int64x2_t GI_INT64_t;
#elif defined(GI_SSE2_INTRINSICS) || defined(GI_SSE42_INTRINSICS)
......@@ -302,6 +304,8 @@ typedef vint32m1x2_t GI_INT32_V2_t;
typedef vint32m1x4_t GI_INT32_V4_t;
typedef vint16m1x2_t GI_INT16_V2_t;
typedef vint8m1x2_t GI_INT8_V2_t;
typedef vint8m1x3_t GI_INT8_V3_t;
typedef vint8m1x4_t GI_INT8_V4_t;
//! vfloat32mf2_t usable at RVV1.0, now we support 0.7, as
//! a workaround, we use vfloat32m1_t instead
typedef vfloat32m1_t float32x2_t;
......@@ -390,6 +394,14 @@ typedef struct {
GI_INT8_t val[2];
} GI_INT8_V2_t;
typedef struct {
GI_INT8_t val[3];
} GI_INT8_V3_t;
typedef struct {
GI_INT8_t val[4];
} GI_INT8_V4_t;
#endif
//! variable length type intrinsic can not be a member of c++ class
//! caused by can not do sizeof at build stage, for example RVV and SVE
......@@ -417,6 +429,8 @@ typedef GI_UINT32_NAIVE_t GI_UINT32_FIXLEN_t;
#define GiGetSubVectorInt16V2(s, index) vget_i16m1x2_i16m1(s, index)
#define GiGetSubVectorInt8V2(s, index) vget_i8m1x2_i8m1(s, index)
#define GiGetSubVectorInt8V3(s, index) vget_i8m1x3_i8m1(s, index)
#define GiGetSubVectorInt8V4(s, index) vget_i8m1x4_i8m1(s, index)
//! insert subvector
#define GiSetSubVectorFloat32V2(d, index, s) d = vset_f32m1x2(d, index, s)
......@@ -570,6 +584,8 @@ typedef GI_UINT32_t GI_UINT32_FIXLEN_t;
#define GiGetSubVectorInt16V2(s, index) s.val[index]
#define GiGetSubVectorInt8V2(s, index) s.val[index]
#define GiGetSubVectorInt8V3(s, index) s.val[index]
#define GiGetSubVectorInt8V4(s, index) s.val[index]
//! insert subvector
#define GiSetSubVectorFloat32V2(d, index, s) d.val[index] = s
......
......@@ -381,31 +381,6 @@ GI_FORCEINLINE void GiSt1Float32(float* ptr, float32x2_t val) {
#endif
}
GI_FORCEINLINE GI_FLOAT32_V2_t GiLd2qFloat32(const float* Buffer) {
#if defined(GI_NEON_INTRINSICS)
return vld2q_f32(Buffer);
#elif defined(GI_SSE2_INTRINSICS)
GI_FLOAT32_V2_t v;
v.val[0] = GiLoadFloat32(Buffer);
v.val[1] = GiLoadFloat32((Buffer + 4));
v = GiUzpqFloat32(v.val[0], v.val[1]);
return v;
#elif defined(GI_RVV_INTRINSICS)
return vlseg2e32_v_f32m1x2(Buffer, GI_SIMD_LEN_BYTE / sizeof(float));
#else
GI_FLOAT32_V2_t ret;
ret.val[0][0] = Buffer[0];
ret.val[0][1] = Buffer[2];
ret.val[0][2] = Buffer[4];
ret.val[0][3] = Buffer[6];
ret.val[1][0] = Buffer[1];
ret.val[1][1] = Buffer[3];
ret.val[1][2] = Buffer[5];
ret.val[1][3] = Buffer[7];
return ret;
#endif
}
#if defined(GI_NEON_INTRINSICS)
#define GiExtqFloat32(a, b, n) vextq_f32(a, b, n)
#elif defined(GI_SSE2_INTRINSICS)
......@@ -1709,6 +1684,31 @@ GI_FORCEINLINE float32x2_t GiPmaxFloat32(float32x2_t a, float32x2_t b) {
#endif
}
GI_FORCEINLINE GI_FLOAT32_V2_t GiLoadUzipFloat32V2(const float* Buffer) {
#if defined(GI_NEON_INTRINSICS)
return vld2q_f32(Buffer);
#elif defined(GI_SSE2_INTRINSICS)
GI_FLOAT32_V2_t v;
v.val[0] = GiLoadFloat32(Buffer);
v.val[1] = GiLoadFloat32((Buffer + 4));
v = GiUzpqFloat32(v.val[0], v.val[1]);
return v;
#elif defined(GI_RVV_INTRINSICS)
return vlseg2e32_v_f32m1x2(Buffer, GI_SIMD_LEN_BYTE / sizeof(float));
#else
GI_FLOAT32_V2_t ret;
ret.val[0][0] = Buffer[0];
ret.val[0][1] = Buffer[2];
ret.val[0][2] = Buffer[4];
ret.val[0][3] = Buffer[6];
ret.val[1][0] = Buffer[1];
ret.val[1][1] = Buffer[3];
ret.val[1][2] = Buffer[5];
ret.val[1][3] = Buffer[7];
return ret;
#endif
}
GI_FORCEINLINE
GI_FLOAT32_V3_t GiLoadUzipFloat32V3(const float* ptr) {
#if defined(GI_NEON_INTRINSICS)
......
......@@ -86,6 +86,152 @@ GI_INT8_t GiLoadInt8(const void* Buffer) {
#endif
}
GI_FORCEINLINE
GI_INT8_V2_t GiLoadUzipInt8V2(const void* Buffer) {
#if defined(GI_NEON_INTRINSICS)
return vld2q_s8((int8_t*)Buffer);
#elif defined(GI_SSE42_INTRINSICS)
GI_INT8_t v0, v1;
v0 = _mm_loadu_si128((const __m128i*)Buffer);
v1 = _mm_loadu_si128((const __m128i*)((int8_t*)Buffer + 16));
GI_INT8_V2_t ret;
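    // Within each 16-byte vector, the shuffle gathers even-indexed bytes into
    // the low half and odd-indexed bytes into the high half; the 64-bit
    // unpacks then join the two halves into the de-interleaved val[0]/val[1].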
v0 = _mm_shuffle_epi8(
v0, _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15));
v1 = _mm_shuffle_epi8(
v1, _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15));
ret.val[0] = _mm_unpacklo_epi64(v0, v1);
ret.val[1] = _mm_unpackhi_epi64(v0, v1);
return ret;
#elif defined(GI_RVV_INTRINSICS)
return vlseg2e8_v_i8m1x2((int8_t*)Buffer, GI_SIMD_LEN_BYTE / sizeof(int8_t));
#else
int8_t data[2 * GI_SIMD_LEN_BYTE];
const int8_t* ptr = (int8_t*)Buffer;
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) {
data[i] = ptr[2 * i];
data[GI_SIMD_LEN_BYTE + i] = ptr[2 * i + 1];
}
GI_INT8_V2_t ret;
ret.val[0] = GiLoadInt8(data);
ret.val[1] = GiLoadInt8(data + GI_SIMD_LEN_BYTE);
return ret;
#endif
}
GI_FORCEINLINE
GI_INT8_V3_t GiLoadUzipInt8V3(const void* Buffer) {
#if defined(GI_NEON_INTRINSICS)
return vld3q_s8((int8_t*)Buffer);
#elif defined(GI_SSE42_INTRINSICS)
GI_INT8_V3_t v;
__m128i tmp0, tmp1, tmp2, tmp3;
static const int8_t mask8_0[16] = {0, 3, 6, 9, 12, 15, 1, 4,
7, 10, 13, 2, 5, 8, 11, 14};
static const int8_t mask8_1[16] = {2, 5, 8, 11, 14, 0, 3, 6,
9, 12, 15, 1, 4, 7, 10, 13};
static const int8_t mask8_2[16] = {1, 4, 7, 10, 13, 2, 5, 8,
11, 14, 0, 3, 6, 9, 12, 15};
v.val[0] = _mm_loadu_si128((const __m128i*)Buffer);
v.val[1] = _mm_loadu_si128((const __m128i*)((int8_t*)Buffer + 16));
v.val[2] = _mm_loadu_si128((const __m128i*)((int8_t*)Buffer + 32));
tmp0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask8_0);
tmp1 = _mm_shuffle_epi8(v.val[1], *(__m128i*)mask8_1);
tmp2 = _mm_shuffle_epi8(v.val[2], *(__m128i*)mask8_2);
tmp3 = _mm_slli_si128(tmp0, 10);
tmp3 = _mm_alignr_epi8(tmp1, tmp3, 10); // a:0,3,6,9,12,15,b:2,5,8,11,14,x,x,x,x,x
tmp3 = _mm_slli_si128(tmp3, 5); // 0,0,0,0,0,a:0,3,6,9,12,15,b:2,5,8,11,14,
tmp3 = _mm_srli_si128(tmp3, 5); // a:0,3,6,9,12,15,b:2,5,8,11,14,:0,0,0,0,0
v.val[0] = _mm_slli_si128(tmp2, 11); // 0,0,0,0,0,0,0,0,0,0,0,0, 1,4,7,10,13,
v.val[0] = _mm_or_si128(v.val[0], tmp3);
tmp3 = _mm_slli_si128(tmp0, 5); // 0,0,0,0,0,a:0,3,6,9,12,15,1,4,7,10,13,
tmp3 = _mm_srli_si128(tmp3, 11); // a:1,4,7,10,13, 0,0,0,0,0,0,0,0,0,0,0
v.val[1] = _mm_srli_si128(tmp1, 5); // b:0,3,6,9,12,15,C:1,4,7,10,13, 0,0,0,0,0
v.val[1] = _mm_slli_si128(v.val[1], 5); // 0,0,0,0,0,b:0,3,6,9,12,15,C:1,4,7,10,13,
v.val[1] = _mm_or_si128(v.val[1], tmp3);
v.val[1] = _mm_slli_si128(v.val[1], 5); // 0,0,0,0,0,a:1,4,7,10,13,b:0,3,6,9,12,15,
v.val[1] = _mm_srli_si128(v.val[1], 5); // a:1,4,7,10,13,b:0,3,6,9,12,15,0,0,0,0,0
tmp3 = _mm_srli_si128(tmp2, 5); // c:2,5,8,11,14,0,3,6,9,12,15,0,0,0,0,0
tmp3 = _mm_slli_si128(tmp3, 11); // 0,0,0,0,0,0,0,0,0,0,0,c:2,5,8,11,14,
v.val[1] = _mm_or_si128(v.val[1], tmp3);
tmp3 = _mm_srli_si128(tmp2, 10); // c:0,3,6,9,12,15, 0,0,0,0,0,0,0,0,0,0,
tmp3 = _mm_slli_si128(tmp3, 10); // 0,0,0,0,0,0,0,0,0,0, c:0,3,6,9,12,15,
v.val[2] = _mm_srli_si128(tmp1, 11); // b:1,4,7,10,13,0,0,0,0,0,0,0,0,0,0,0
v.val[2] = _mm_slli_si128(v.val[2], 5); // 0,0,0,0,0,b:1,4,7,10,13, 0,0,0,0,0,0
v.val[2] =
_mm_or_si128(v.val[2], tmp3); // 0,0,0,0,0,b:1,4,7,10,13,c:0,3,6,9,12,15,
tmp0 = _mm_srli_si128(tmp0, 11); // a:2,5,8,11,14, 0,0,0,0,0,0,0,0,0,0,0,
v.val[2] = _mm_or_si128(v.val[2], tmp0);
return v;
#elif defined(GI_RVV_INTRINSICS)
return vlseg3e8_v_i8m1x3((int8_t*)Buffer, GI_SIMD_LEN_BYTE / sizeof(int8_t));
#else
int8_t data[3 * GI_SIMD_LEN_BYTE];
const int8_t* ptr = (int8_t*)Buffer;
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) {
data[i] = ptr[3 * i];
data[GI_SIMD_LEN_BYTE + i] = ptr[3 * i + 1];
data[2 * GI_SIMD_LEN_BYTE + i] = ptr[3 * i + 2];
}
GI_INT8_V3_t ret;
ret.val[0] = GiLoadInt8(data);
ret.val[1] = GiLoadInt8(data + GI_SIMD_LEN_BYTE);
ret.val[2] = GiLoadInt8(data + 2 * GI_SIMD_LEN_BYTE);
return ret;
#endif
}
GI_FORCEINLINE
GI_INT8_V4_t GiLoadUzipInt8V4(const void* Buffer) {
#if defined(GI_NEON_INTRINSICS)
return vld4q_s8((int8_t*)Buffer);
#elif defined(GI_SSE2_INTRINSICS)
GI_INT8_V4_t v;
__m128i tmp3, tmp2, tmp1, tmp0;
v.val[0] = _mm_loadu_si128((const __m128i*)Buffer);
v.val[1] = _mm_loadu_si128((const __m128i*)((int8_t*)Buffer + 16));
v.val[2] = _mm_loadu_si128((const __m128i*)((int8_t*)Buffer + 32));
v.val[3] = _mm_loadu_si128((const __m128i*)((int8_t*)Buffer + 48));
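    // Four unpack passes (8-bit, 8-bit, 32-bit, 8-bit) act as a byte
    // transpose, so val[k] ends up holding every 4th byte starting at offset
    // k, the same layout that vld4q_s8 produces on NEON.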
tmp0 = _mm_unpacklo_epi8(v.val[0], v.val[1]);
tmp1 = _mm_unpacklo_epi8(v.val[2], v.val[3]);
tmp2 = _mm_unpackhi_epi8(v.val[0], v.val[1]);
tmp3 = _mm_unpackhi_epi8(v.val[2], v.val[3]);
v.val[0] = _mm_unpacklo_epi8(tmp0, tmp2);
v.val[1] = _mm_unpackhi_epi8(tmp0, tmp2);
v.val[2] = _mm_unpacklo_epi8(tmp1, tmp3);
v.val[3] = _mm_unpackhi_epi8(tmp1, tmp3);
tmp0 = _mm_unpacklo_epi32(v.val[0], v.val[2]);
tmp1 = _mm_unpackhi_epi32(v.val[0], v.val[2]);
tmp2 = _mm_unpacklo_epi32(v.val[1], v.val[3]);
tmp3 = _mm_unpackhi_epi32(v.val[1], v.val[3]);
v.val[0] = _mm_unpacklo_epi8(tmp0, tmp2);
v.val[1] = _mm_unpackhi_epi8(tmp0, tmp2);
v.val[2] = _mm_unpacklo_epi8(tmp1, tmp3);
v.val[3] = _mm_unpackhi_epi8(tmp1, tmp3);
return v;
#elif defined(GI_RVV_INTRINSICS)
return vlseg4e8_v_i8m1x4((int8_t*)Buffer, GI_SIMD_LEN_BYTE / sizeof(int8_t));
#else
GI_INT8_V4_t ret;
const int8_t* ptr = (int8_t*)Buffer;
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) {
ret.val[0][i] = ptr[4 * i];
ret.val[1][i] = ptr[4 * i + 1];
ret.val[2][i] = ptr[4 * i + 2];
ret.val[3][i] = ptr[4 * i + 3];
}
return ret;
#endif
}
GI_FORCEINLINE
void GiStoreInt32(void* Buffer, GI_INT32_t Vector) {
#if defined(GI_NEON_INTRINSICS)
......
......@@ -471,7 +471,7 @@ void do_average_pooling_3x3_s2x2_gi(
int odd_offset = 0, even_offset = 0;
for (; iw + 2 * MEGDNN_SIMD_WIDTH <= IW; iw += 2 * MEGDNN_SIMD_WIDTH) {
auto s0 = GiLd2qFloat32(sptr + iw);
auto s0 = GiLoadUzipFloat32V2(sptr + iw);
GiStoreFloat32(even + even_offset, GiGetSubVectorFloat32V2(s0, 0));
GiStoreFloat32(odd + odd_offset, GiGetSubVectorFloat32V2(s0, 1));
even_offset += MEGDNN_SIMD_WIDTH;
......
......@@ -186,8 +186,15 @@ bool ReduceImpl::exec_optimized(
#define DISPATCH_FUNC(Reducer, dtype, ctype, comp_type) \
if (C == 1) { \
using _Reducer = Reducer<dtype, ctype, comp_type, true>; \
using _ReducerC1SmallB = Reducer<dtype, ctype, comp_type, false>; \
std::function<void(const ctype*, ctype*, DType, size_t, size_t, size_t)> \
do_reduce = Exec<_Reducer, true>::do_reduce; \
if (B == 2) \
do_reduce = ExecC1SmallB<_ReducerC1SmallB, ctype, 2>::do_reduce; \
if (B == 3) \
do_reduce = ExecC1SmallB<_ReducerC1SmallB, ctype, 3>::do_reduce; \
if (B == 4) \
do_reduce = ExecC1SmallB<_ReducerC1SmallB, ctype, 4>::do_reduce; \
MIDOUT_BEGIN( \
megdnn_fallback_reduce_optimized, ctype, dtype, comp_type, \
midout_iv(0)) { \
......
......@@ -46,8 +46,8 @@ struct MeanReducer<dt_qint8, int8_t, int32_t, false> {
vcoef = GiFloat32Type2FixLenType(GiBroadcastFloat32(coef));
}
MeanReducer() = default;
void feed(const int8_t* val) {
const GI_INT8_t vval = GiLoadInt8(val);
void feed(const int8_t* val) { feed_vector(GiLoadInt8(val)); }
void feed_vector(const GI_INT8_t vval) {
const GI_INT16_t vval_low = GiMoveLowLongInt8(vval);
const GI_INT16_t vval_high = GiMoveHighLongInt8(vval);
......@@ -121,9 +121,10 @@ struct MeanReducer<dt_float32, float, float, false> {
res = GiFloat32Type2FixLenType(GiBroadcastFloat32(0.0f));
}
MeanReducer() = default;
void feed(const float* val) {
void feed(const float* val) { feed_vector(GiLoadFloat32(val)); }
void inline feed_vector(const GI_FLOAT32_t& val) {
res = GiFloat32Type2FixLenType(
GiAddFloat32(GiLoadFloat32(val), GiFixLenType2GiFloat32Type(res)));
GiAddFloat32(val, GiFixLenType2GiFloat32Type(res)));
}
void feed_remain(const float* val) { remain += *val; }
void post(float* dst) {
......@@ -172,31 +173,31 @@ REDUCER_MAX_MIN_C1(min, Min, std::numeric_limits<dt_float32>::max());
#define Max_NAN(a, b) (isnan(a) || (a) > (b)) ? (a) : (b);
#define Min_NAN(a, b) (isnan(a) || (a) < (b)) ? (a) : (b);
#define REDUCER_MAX_MIN_C(_mode, _Mode, _init) \
template <> \
struct _mode##Reducer<dt_float32, float, float, false> { \
using ctype = float; \
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \
GI_FLOAT32_FIXLEN_t res; \
float remain; \
_mode##Reducer(DType, size_t) { \
res = GiFloat32Type2FixLenType(GiBroadcastFloat32(_init)); \
remain = _init; \
} \
_mode##Reducer() = default; \
void feed(const float* val) { \
GI_FLOAT32_t vval = GiLoadFloat32(val); \
res = GiFloat32Type2FixLenType( \
Gi##_Mode##NanFloat32(GiFixLenType2GiFloat32Type(res), vval)); \
} \
void feed_remain(const float* val) { \
using namespace std; \
remain = _Mode##_NAN(*val, remain); \
} \
void post(float* dst) { \
GiStoreFloat32(dst, GiFixLenType2GiFloat32Type(res)); \
} \
void post_remain(float* dst) { *dst = remain; } \
#define REDUCER_MAX_MIN_C(_mode, _Mode, _init) \
template <> \
struct _mode##Reducer<dt_float32, float, float, false> { \
using ctype = float; \
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \
GI_FLOAT32_FIXLEN_t res; \
float remain; \
_mode##Reducer(DType, size_t) { \
res = GiFloat32Type2FixLenType(GiBroadcastFloat32(_init)); \
remain = _init; \
} \
_mode##Reducer() = default; \
void feed(const float* val) { feed_vector(GiLoadFloat32(val)); } \
void inline feed_vector(const GI_FLOAT32_t& val) { \
res = GiFloat32Type2FixLenType( \
Gi##_Mode##NanFloat32(GiFixLenType2GiFloat32Type(res), val)); \
} \
void feed_remain(const float* val) { \
using namespace std; \
remain = _Mode##_NAN(*val, remain); \
} \
void post(float* dst) { \
GiStoreFloat32(dst, GiFixLenType2GiFloat32Type(res)); \
} \
void post_remain(float* dst) { *dst = remain; } \
}
REDUCER_MAX_MIN_C(max, Max, std::numeric_limits<dt_float32>::lowest());
......@@ -246,10 +247,10 @@ REDUCER_MAX_MIN_C1(min, Min, 127);
remain = _init; \
} \
_mode##Reducer() = default; \
void feed(const int8_t* val) { \
GI_INT8_t vval = GiLoadInt8(val); \
void feed(const int8_t* val) { feed_vector(GiLoadInt8(val)); } \
void inline feed_vector(GI_INT8_t val) { \
res = GiInt8Type2FixLenType( \
Gi##_Mode##imumInt8(GiFixLenType2GiInt8Type(res), vval)); \
Gi##_Mode##imumInt8(GiFixLenType2GiInt8Type(res), val)); \
} \
void feed_remain(const int8_t* val) { \
using namespace std; \
......@@ -304,32 +305,32 @@ REDUCER_SUM_PRODUCT_C1(Sum, Add, plus, 0.0f);
REDUCER_SUM_PRODUCT_C1(Product, Multiply, multiplies, 1.0f);
#undef REDUCER_SUM_PRODUCT_C1
#define REDUCER_SUM_PRODUCT_C(_mode, _Mode, _op, _init) \
template <> \
struct _mode##Reducer<dt_float32, float, float, false> { \
using ctype = float; \
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \
GI_FLOAT32_FIXLEN_t res; \
float remain; \
_mode##Reducer(DType, size_t) { \
res = GiFloat32Type2FixLenType(GiBroadcastFloat32(_init)); \
remain = _init; \
} \
_mode##Reducer() = default; \
void feed(const float* val) { \
GI_FLOAT32_t vval = GiLoadFloat32(val); \
res = GiFloat32Type2FixLenType( \
Gi##_Mode##Float32(vval, GiFixLenType2GiFloat32Type(res))); \
} \
void feed_remain(const float* val) { \
using namespace std; \
auto op = _op<float>(); \
remain = op(remain, (*val)); \
} \
void post(float* dst) { \
GiStoreFloat32(dst, GiFixLenType2GiFloat32Type(res)); \
} \
void post_remain(float* dst) { *dst = remain; } \
#define REDUCER_SUM_PRODUCT_C(_mode, _Mode, _op, _init) \
template <> \
struct _mode##Reducer<dt_float32, float, float, false> { \
using ctype = float; \
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \
GI_FLOAT32_FIXLEN_t res; \
float remain; \
_mode##Reducer(DType, size_t) { \
res = GiFloat32Type2FixLenType(GiBroadcastFloat32(_init)); \
remain = _init; \
} \
_mode##Reducer() = default; \
void feed(const float* val) { feed_vector(GiLoadFloat32(val)); } \
void inline feed_vector(GI_FLOAT32_t val) { \
res = GiFloat32Type2FixLenType( \
Gi##_Mode##Float32(val, GiFixLenType2GiFloat32Type(res))); \
} \
void feed_remain(const float* val) { \
using namespace std; \
auto op = _op<float>(); \
remain = op(remain, (*val)); \
} \
void post(float* dst) { \
GiStoreFloat32(dst, GiFixLenType2GiFloat32Type(res)); \
} \
void post_remain(float* dst) { *dst = remain; } \
}
REDUCER_SUM_PRODUCT_C(Sum, Add, plus, 0.0f);
......@@ -378,15 +379,16 @@ struct SumSqrReducer<dt_float32, float, float, false> {
res = GiFloat32Type2FixLenType(GiBroadcastFloat32(0.0f));
}
SumSqrReducer() = default;
void feed(const float* val) {
GI_FLOAT32_t vval = GiLoadFloat32(val);
void feed(const float* val) { feed_vector(GiLoadFloat32(val)); }
void inline feed_vector(GI_FLOAT32_t src) {
res = GiFloat32Type2FixLenType(GiAddFloat32(
GiMultiplyFloat32(vval, vval), GiFixLenType2GiFloat32Type(res)));
GiMultiplyFloat32(src, src), GiFixLenType2GiFloat32Type(res)));
}
void feed_remain(const float* val) { remain += (*val) * (*val); }
void post(float* dst) { GiStoreFloat32(dst, GiFixLenType2GiFloat32Type(res)); }
void post_remain(float* dst) { *dst = remain; }
};
/**************************************do reduce*************************/
template <typename Reducer, bool C1>
......@@ -446,6 +448,57 @@ struct Exec<Reducer, false> {
}
};
template <typename Reducer, typename dtype, size_t B>
struct ExecC1SmallB {
static void do_reduce(
const dtype* src, dtype* dst, DType src_dtype, size_t A, size_t, size_t C);
};
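// Fallback counterpart of the small-B path: de-interleave with the
// GiLoadUzip*V{2,3,4} loads and feed each component vector to the reducer,
// emitting SIMD_WIDTH results per pass; leftover rows use the scalar path.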
#define ImplementC1SmallB(_ctype, _gi_type, _gi_ins) \
template <typename Reducer, size_t B> \
struct ExecC1SmallB<Reducer, _ctype, B> { \
static void do_reduce( \
const _ctype* src, _ctype* dst, DType src_dtype, size_t A, size_t, \
size_t) { \
size_t a = 0; \
for (; a + Reducer::SIMD_WIDTH < A; a += Reducer::SIMD_WIDTH) { \
Reducer reducer(src_dtype, B); \
auto src_ptr = src + a * B; \
if (B == 4) { \
GI_##_gi_type##_V4_t data_v4 = GiLoadUzip##_gi_ins##V4(src_ptr); \
reducer.feed_vector(GiGetSubVector##_gi_ins##V4(data_v4, 0)); \
reducer.feed_vector(GiGetSubVector##_gi_ins##V4(data_v4, 1)); \
reducer.feed_vector(GiGetSubVector##_gi_ins##V4(data_v4, 2)); \
reducer.feed_vector(GiGetSubVector##_gi_ins##V4(data_v4, 3)); \
} \
if (B == 3) { \
GI_##_gi_type##_V3_t data_v3 = GiLoadUzip##_gi_ins##V3(src_ptr); \
reducer.feed_vector(GiGetSubVector##_gi_ins##V3(data_v3, 0)); \
reducer.feed_vector(GiGetSubVector##_gi_ins##V3(data_v3, 1)); \
reducer.feed_vector(GiGetSubVector##_gi_ins##V3(data_v3, 2)); \
} \
if (B == 2) { \
GI_##_gi_type##_V2_t data_v2 = GiLoadUzip##_gi_ins##V2(src_ptr); \
reducer.feed_vector(GiGetSubVector##_gi_ins##V2(data_v2, 0)); \
reducer.feed_vector(GiGetSubVector##_gi_ins##V2(data_v2, 1)); \
} \
reducer.post(dst); \
dst += Reducer::SIMD_WIDTH; \
} \
for (; a < A; a++) { \
Reducer reducer(src_dtype, B); \
auto src_ptr = src + a * B; \
for (size_t i = 0; i < B; i++) \
reducer.feed_remain(src_ptr + i); \
reducer.post_remain(dst); \
dst++; \
} \
} \
}
ImplementC1SmallB(float, FLOAT32, Float32);
ImplementC1SmallB(int8_t, INT8, Int8);
} // namespace
// vim: syntax=cpp.doxygen
......@@ -565,7 +565,8 @@ struct ResizeAreaFastVec_SIMD_32f {
if (cn == 1) {
for (; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) {
GI_FLOAT32_V2_t v_row0 = GiLd2qFloat32(S0), v_row1 = GiLd2qFloat32(S1);
GI_FLOAT32_V2_t v_row0 = GiLoadUzipFloat32V2(S0),
v_row1 = GiLoadUzipFloat32V2(S1);
GI_FLOAT32_t v_dst0 = GiAddFloat32(
GiGetSubVectorFloat32V2(v_row0, 0),
......
......@@ -29,7 +29,7 @@ TEST_F(ARM_COMMON, REDUCE) {
for (int32_t axis : {0, 1, 2}) {
for (size_t A : {1, 3, 5}) {
for (size_t B : {4, 6, 9, 16, 33, 45}) {
for (size_t C : {4, 6, 9, 16, 33, 45}) {
for (size_t C : {2, 3, 4, 6, 9, 16, 33, 45}) {
TensorShape shape{A, B, C};
Param param(mode, axis);
Config config(param, dtype, shape);
......@@ -51,7 +51,7 @@ TEST_F(ARM_COMMON, REDUCE) {
for (int32_t axis : {0, 1, 2}) {
for (size_t A : {1, 3, 5}) {
for (size_t B : {4, 6, 9, 16, 33, 45}) {
for (size_t C : {4, 6, 9, 16, 33, 45}) {
for (size_t C : {2, 3, 4, 6, 9, 16, 33, 45}) {
TensorShape shape{A, B, C};
Param param(mode, axis);
Config config(param, dtype, shape);
......
......@@ -1356,12 +1356,12 @@ TEST_F(FALLBACK, GiSt1Float32) {
ASSERT_EQ(ret[1], s0[1]);
}
TEST_F(FALLBACK, GiLd2qFloat32) {
TEST_F(FALLBACK, GiLoadUzipFloat32) {
GI_FLOAT32_V2_t ret;
std::vector<float> s0{1.1f, 2.2f, 3.5f, 4.9f, 2312.1f, 345.244f, 3.59f, -12.8f};
force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE * 2);
ret = GiLd2qFloat32(s0.data());
ret = GiLoadUzipFloat32V2(s0.data());
std::vector<float> naive0;
std::vector<float> naive1;
......@@ -2590,6 +2590,61 @@ TEST_F(FALLBACK, GiLoadInt8) {
}
}
TEST_F(FALLBACK, GiLoadUzipInt8V2) {
std::vector<int8_t> s0{9, 2, -128, 127, 2, 45, 3, 0, 11, 2, -128,
127, 2, 55, 3, -1, 7, 8, -18, 17, 12, 35,
7, 8, 10, 22, -108, 27, 21, 45, 13, -11};
GI_INT8_V2_t ret;
force_memset_ret((void*)&ret, 2 * GI_SIMD_LEN_BYTE);
ret = GiLoadUzipInt8V2(s0.data());
auto p = (int8_t*)&ret;
for (size_t i = 0; i < SIMD_LEN_8; i++) {
ASSERT_EQ(p[i], s0[2 * i]);
ASSERT_EQ(p[SIMD_LEN_8 + i], s0[2 * i + 1]);
}
}
TEST_F(FALLBACK, GiLoadUzipInt8V3) {
std::vector<int8_t> s0{9, 2, -128, 127, 2, 45, 3, 0, 11, 2, -128, 127,
2, 55, 3, -1, 7, 8, -18, 17, 12, 35, 7, 8,
10, 22, -108, 27, 21, 45, 13, -11, 11, 14, -11, 12,
111, 32, 6, 9, 16, 29, -118, 67, 28, 15, 19, -10};
GI_INT8_V3_t ret;
force_memset_ret((void*)&ret, 3 * GI_SIMD_LEN_BYTE);
ret = GiLoadUzipInt8V3(s0.data());
auto p = (int8_t*)&ret;
for (size_t i = 0; i < SIMD_LEN_8; i++) {
ASSERT_EQ(p[i], s0[3 * i]);
ASSERT_EQ(p[SIMD_LEN_8 + i], s0[3 * i + 1]);
ASSERT_EQ(p[2 * SIMD_LEN_8 + i], s0[3 * i + 2]);
}
}
TEST_F(FALLBACK, GiLoadUzipInt8V4) {
std::vector<int8_t> s0{
9, 2, -128, 127, 2, 45, 3, 0, 11, 2, -128, 127, 2, 55, 3, -1,
7, 8, -18, 17, 12, 35, 7, 8, 10, 22, -108, 27, 21, 45, 13, -11,
11, 14, -11, 12, 111, 32, 6, 9, 16, 29, -118, 67, 28, 15, 19, -10,
9, 4, -108, 27, 22, 43, 13, 10, 31, 12, -108, 117, 22, 25, 31, -10,
};
GI_INT8_V4_t ret;
force_memset_ret((void*)&ret, 4 * GI_SIMD_LEN_BYTE);
ret = GiLoadUzipInt8V4(s0.data());
auto p = (int8_t*)&ret;
for (size_t i = 0; i < SIMD_LEN_8; i++) {
ASSERT_EQ(p[i], s0[4 * i]);
ASSERT_EQ(p[SIMD_LEN_8 + i], s0[4 * i + 1]);
ASSERT_EQ(p[2 * SIMD_LEN_8 + i], s0[4 * i + 2]);
ASSERT_EQ(p[3 * SIMD_LEN_8 + i], s0[4 * i + 3]);
}
}
TEST_F(FALLBACK, GiStoreInt32) {
GI_INT32_t src0;
std::vector<int32_t> s0{1, 2, -200, 999};
......
#include "test/fallback/fixture.h"
#include "megdnn/oprs.h"
#include "test/common/benchmarker.h"
#include "test/common/checker.h"
#include "test/common/task_record_check.h"
#include "test/common/tensor.h"
......@@ -27,9 +28,9 @@ TEST_F(FALLBACK, REDUCE_FULL) {
dtype::Float32(), dtype::Float16(), dtype::QuantizedS8(1.3f),
dtype::Quantized8Asymm(1.3f, static_cast<uint8_t>(3))})
for (int32_t axis : {0, 1, 2}) {
for (size_t A : {1, 3, 5}) {
for (size_t A : {1, 3, 5, 20}) {
for (size_t B : {4, 6, 9, 16, 33, 45}) {
for (size_t C : {4, 6, 9, 16, 33, 45}) {
for (size_t C : {2, 3, 4, 6, 9, 16, 33, 45}) {
TensorShape shape{A, B, C};
Param param(mode, axis);
Config config(param, dtype, shape);
......@@ -49,9 +50,9 @@ TEST_F(FALLBACK, REDUCE_FULL) {
for (auto mode : {Mode::SUM, Mode::PRODUCT, Mode::SUM_SQR})
for (auto dtype : std::vector<DType>{dtype::Float32(), dtype::Float16()})
for (int32_t axis : {0, 1, 2}) {
for (size_t A : {1, 3, 5}) {
for (size_t A : {1, 3, 5, 20}) {
for (size_t B : {4, 6, 9, 16, 33, 45}) {
for (size_t C : {4, 6, 9, 16, 33, 45}) {
for (size_t C : {2, 3, 4, 6, 9, 16, 33, 45}) {
TensorShape shape{A, B, C};
Param param(mode, axis);
Config config(param, dtype, shape);
......@@ -301,4 +302,56 @@ TEST_F(FALLBACK, REDUCE_RECORD) {
}
}
#if MEGDNN_WITH_BENCHMARK
TEST_F(FALLBACK, BENCHMARK_REDUCE_VS_CONV) {
auto run = [&]() {
Benchmarker<Reduce> benchmarker_reduce(handle());
Benchmarker<Convolution> benchmarker_conv(handle());
benchmarker_reduce.set_display(false);
benchmarker_conv.set_display(false);
constexpr size_t RUNS = 50;
benchmarker_reduce.set_times(RUNS);
benchmarker_conv.set_times(RUNS);
param::Reduce param;
param.axis = 3;
param.mode = param::Reduce::Mode::SUM;
benchmarker_reduce.set_param(param);
param::Convolution param_conv;
benchmarker_conv.set_param(param_conv);
{
TensorLayout src({24, 240, 128, 2}, dtype::Float32());
auto reduce = benchmarker_reduce.execs({src, {}}) / RUNS;
TensorLayout conv_src({24, 2, 240, 128}, dtype::Float32());
TensorLayout conv_weight({1, 2, 1, 1}, dtype::Float32());
auto conv = benchmarker_conv.execs({conv_src, conv_weight, {}}) / RUNS;
printf("case 1: reduce use time %fms, convolution use time %fms\n", reduce,
conv);
}
{
TensorLayout src({24, 240, 128, 3}, dtype::Float32());
auto reduce = benchmarker_reduce.execs({src, {}}) / RUNS;
TensorLayout conv_src({24, 3, 240, 128}, dtype::Float32());
TensorLayout conv_weight({1, 3, 1, 1}, dtype::Float32());
auto conv = benchmarker_conv.execs({conv_src, conv_weight, {}}) / RUNS;
printf("case 2: reduce use time %fms, convolution use time %fms\n", reduce,
conv);
}
{
TensorLayout src({24, 240, 128, 4}, dtype::Float32());
auto reduce = benchmarker_reduce.execs({src, {}}) / RUNS;
TensorLayout conv_src({24, 4, 240, 128}, dtype::Float32());
TensorLayout conv_weight({1, 4, 1, 1}, dtype::Float32());
auto conv = benchmarker_conv.execs({conv_src, conv_weight, {}}) / RUNS;
printf("case 3: reduce use time %fms, convolution use time %fms\n", reduce,
conv);
}
};
run();
}
#endif
// vim: syntax=cpp.doxygen
......@@ -467,6 +467,33 @@ TEST_F(X86, BENCHMARK_CONVOLUTION_I8x8x32_MKLDNN) {
}
#endif
TEST_F(X86, BENCHMARK_REDUCE_VS_CONV) {
auto run = [&]() {
Benchmarker<Reduce> benchmarker_reduce(handle());
Benchmarker<Convolution> benchmarker_conv(handle());
benchmarker_reduce.set_display(false);
benchmarker_conv.set_display(false);
constexpr size_t RUNS = 50;
benchmarker_reduce.set_times(RUNS);
benchmarker_conv.set_times(RUNS);
param::Reduce param;
param.axis = 3;
param.mode = param::Reduce::Mode::SUM;
benchmarker_reduce.set_param(param);
param::Convolution param_conv;
benchmarker_conv.set_param(param_conv);
TensorLayout src({24, 240, 128, 3}, dtype::Float32());
auto reduce = benchmarker_reduce.execs({src, {}}) / RUNS;
TensorLayout conv_src({24, 3, 240, 128}, dtype::Float32());
TensorLayout conv_weight({1, 3, 1, 1}, dtype::Float32());
auto conv = benchmarker_conv.execs({conv_src, conv_weight, {}}) / RUNS;
printf("reduce use time %fms, convolution use time %fms\n", reduce, conv);
};
run();
}
#endif
} // namespace test
......