Commit e34a642b authored by Megvii Engine Team

feat(fallback): reduce support general intrinsic

GitOrigin-RevId: f250aa7b2a145a66699636c11e5602f02693ed2a
Parent 10f23778
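The hunks below move the fallback reduce path onto the general intrinsic (GI) wrappers. When neither NEON nor SSE is available, the GI types are plain GCC/Clang vector extensions, so the scalar branches in this commit index vector lanes directly. A minimal standalone sketch of that behavior (not part of the commit; requires GCC or Clang):

```cpp
// Sketch only: mirrors the non-SIMD fallback branch of GiMaximumFloat32,
// using the same vector-extension typedef as the GI header.
#include <cstdio>

typedef float GI_FLOAT32 __attribute__((vector_size(16)));

int main() {
    GI_FLOAT32 a = {1.f, 5.f, 3.f, 7.f};
    GI_FLOAT32 b = {4.f, 2.f, 6.f, 0.f};
    GI_FLOAT32 max;
    // GI_SIMD_LEN_BYTE / sizeof(float) == 4 lanes on the 16-byte fallback
    for (int i = 0; i < 4; i++) {
        max[i] = a[i] > b[i] ? a[i] : b[i];
    }
    printf("%f %f %f %f\n", max[0], max[1], max[2], max[3]);  // lanes: 4 5 6 7
    return 0;
}
```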
......@@ -95,8 +95,8 @@ typedef __m128i GI_INT16;
typedef __m128i GI_INT32;
#else
typedef float GI_FLOAT32 __attribute__((vector_size(16)));
typedef uint16_t GI_UINT8 __attribute__((vector_size(16)));
typedef int16_t GI_INT8 __attribute__((vector_size(16)));
typedef uint8_t GI_UINT8 __attribute__((vector_size(16)));
typedef int8_t GI_INT8 __attribute__((vector_size(16)));
typedef int16_t GI_INT16 __attribute__((vector_size(16)));
typedef int32_t GI_INT32 __attribute__((vector_size(16)));
#endif
......@@ -119,6 +119,9 @@ typedef int32_t GI_INT32 __attribute__((vector_size(16)));
#define GI_SIMD_LEN_BYTE 16
#endif
#define Max(a, b) (a) > (b) ? (a) : (b)
#define Min(a, b) (a) < (b) ? (a) : (b)
typedef struct {
GI_INT32 val[2];
} GI_INT32_V2;
......
......@@ -223,7 +223,7 @@ GiInterleaveLowFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
#if defined(GI_NEON64_INTRINSICS)
return vzip1q_f32(Vector1, Vector2);
#elif defined(GI_NEON32_INTRINSICS)
float32x2_t zipped = vzipq_f32(Vector1, Vector2);
float32x4x2_t zipped = vzipq_f32(Vector1, Vector2);
return zipped.val[0];
#elif defined(GI_SSE2_INTRINSICS)
return _mm_unpacklo_ps(Vector1, Vector2);
......@@ -243,7 +243,7 @@ GiInterleaveHighFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
#if defined(GI_NEON64_INTRINSICS)
return vzip2q_f32(Vector1, Vector2);
#elif defined(GI_NEON32_INTRINSICS)
float32x2_t zipped = vzipq_f32(Vector1, Vector2);
float32x4x2_t zipped = vzipq_f32(Vector1, Vector2);
return zipped.val[1];
#elif defined(GI_SSE2_INTRINSICS)
return _mm_unpackhi_ps(Vector1, Vector2);
......@@ -460,7 +460,14 @@ GiMaximumFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
#if defined(GI_NEON_INTRINSICS)
return vmaxq_f32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
return _mm_max_ps(Vector1, Vector2);
//! _mm_max_ps does not follow the IEEE standard when the input is NaN, so
//! implement it in C code
#define MAX_NAN(a, b) (std::isnan(a) || (a) > (b)) ? (a) : (b);
GI_FLOAT32 max;
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
max[i] = MAX_NAN(Vector1[i], Vector2[i]);
}
return max;
#else
return GiBlendFloat32(Vector2, Vector1, Vector1 > Vector2);
#endif
......@@ -473,6 +480,14 @@ GiMinimumFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
return vminq_f32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
return _mm_min_ps(Vector1, Vector2);
//! _mm_min_ps does not follow the IEEE standard when the input is NaN, so
//! implement it in C code
#define MIN_NAN(a, b) (std::isnan(a) || (a) < (b)) ? (a) : (b);
GI_FLOAT32 min;
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
min[i] = MIN_NAN(Vector1[i], Vector2[i]);
}
return min;
#else
return GiBlendFloat32(Vector2, Vector1, Vector2 > Vector1);
#endif
......
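The SSE2 branches above drop `_mm_max_ps`/`_mm_min_ps` because those intrinsics return the second operand whenever the comparison is unordered, while `vmaxq_f32`/`vminq_f32` propagate NaN. A standalone scalar sketch of the semantics the `MAX_NAN` macro implements (helper name here is illustrative):

```cpp
// NaN-propagating max: returns NaN if either operand is NaN, matching the
// NEON behavior the commit wants the SSE2 path to reproduce.
#include <cmath>
#include <cstdio>

static float max_nan(float a, float b) {
    return (std::isnan(a) || a > b) ? a : b;
}

int main() {
    float nan = std::nanf("");
    printf("%f\n", max_nan(nan, 1.f));  // nan: isnan(a) selects a
    printf("%f\n", max_nan(1.f, nan));  // nan: 1.f > NaN is false, so b is returned
    printf("%f\n", max_nan(2.f, 1.f));  // 2.0
    return 0;
}
```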
......@@ -97,7 +97,7 @@ void GiStoreInt8(int8_t* Buffer, GI_INT8 Vector) {
#elif defined(GI_SSE2_INTRINSICS)
_mm_storeu_si128((__m128i*)Buffer, Vector);
#else
for (int i = 0; i < 16; i++) {
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) {
Buffer[i] = Vector[i];
}
#endif
......@@ -197,7 +197,8 @@ GiAndNotInt8(GI_INT8 VectorNot, GI_INT8 Vector) {
#elif defined(GI_SSE2_INTRINSICS)
return _mm_andnot_si128(VectorNot, Vector);
#else
return (~VectorNot) & Vector;
GI_INT8 Not = ~VectorNot;
return (Not & Vector);
#endif
}
......@@ -327,11 +328,13 @@ GiMoveHighLongInt8(GI_INT8 Vector) {
for (int i = 0; i < 8; i++) {
data[i] = o_data[8 + i];
}
return _mm_loadu_si16(data);
return _mm_loadu_si128((__m128i*)data);
#else
GI_INT16 ret;
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(int8_t); i++) {
ret[i] = Vector[GI_SIMD_LEN_BYTE / 2 + i];
int8_t* data = (int8_t*)&Vector;
size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int8_t);
for (size_t i = 0; i < half_length; i++) {
ret[i] = data[i + half_length];
}
return ret;
#endif
......@@ -351,10 +354,11 @@ GiMoveLowLongInt8(GI_INT8 Vector) {
for (int i = 0; i < 8; i++) {
data[i] = o_data[i];
}
return _mm_loadu_si16(data);
return _mm_loadu_si128((__m128i*)data);
#else
GI_INT16 ret;
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(int8_t); i++) {
size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int8_t);
for (size_t i = 0; i < half_length; i++) {
ret[i] = Vector[i];
}
return ret;
......@@ -375,11 +379,12 @@ GiMoveHighLongInt16(GI_INT16 Vector) {
for (int i = 0; i < 4; i++) {
data[i] = o_data[4 + i];
}
return _mm_loadu_si32(data);
return _mm_loadu_si128((__m128i*)data);
#else
GI_INT32 ret;
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(int16_t); i++) {
ret[i] = Vector[GI_SIMD_LEN_BYTE / 2 + i];
size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int16_t);
for (size_t i = 0; i < half_length; i++) {
ret[i] = Vector[half_length + i];
}
return ret;
#endif
......@@ -399,10 +404,11 @@ GiMoveLowLongInt16(GI_INT16 Vector) {
for (int i = 0; i < 4; i++) {
data[i] = o_data[i];
}
return _mm_loadu_si32(data);
return _mm_loadu_si128((__m128i*)data);
#else
GI_INT32 ret;
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(int16_t); i++) {
size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int16_t);
for (size_t i = 0; i < half_length; i++) {
ret[i] = Vector[i];
}
return ret;
......@@ -414,7 +420,7 @@ int16_t GiReduceAddInt8(GI_INT8 Vector) {
#if defined(GI_NEON64_INTRINSICS)
return vaddlvq_s8(Vector);
#elif defined(GI_NEON32_INTRINSICS)
int32_t sum = vpaddlq_s16(vpaddlq_s8(Vector));
int32x4_t sum = vpaddlq_s16(vpaddlq_s8(Vector));
return (vgetq_lane_s32(sum, 0) + vgetq_lane_s32(sum, 1) + vgetq_lane_s32(sum, 2) +
vgetq_lane_s32(sum, 3));
#elif defined(GI_SSE42_INTRINSICS)
......@@ -431,8 +437,8 @@ int16_t GiReduceAddInt8(GI_INT8 Vector) {
return (int16_t)(ret);
#elif defined(GI_SSE2_INTRINSICS)
__m64 low = GiGetLowInt8x16(Vector);
__m64 high = GiGetHighInt8x16(Vector);
__m64 low = _mm_movepi64_pi64(Vector);
__m64 high = _mm_movepi64_pi64(_mm_unpackhi_epi64(Vector, Vector));
__m128 v0 = _mm_cvtpi8_ps(low);
__m128 v1 = _mm_cvtpi8_ps(_mm_unpackhi_pi32(low, low));
__m128 v2 = _mm_cvtpi8_ps(high);
......@@ -447,16 +453,13 @@ int16_t GiReduceAddInt8(GI_INT8 Vector) {
return (int16_t)(ret0 + ret1 + ret2 + ret3);
#else
int32_t sum = 0;
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) {
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) {
sum += Vector[i];
}
return sum;
#endif
}
#define Max(a, b) (a) > (b) ? (a) : (b)
#define Min(a, b) (a) < (b) ? (a) : (b)
GI_FORCEINLINE
int8_t GiReduceMaxInt8(GI_INT8 Vector) {
#if defined(GI_NEON64_INTRINSICS)
......@@ -480,23 +483,23 @@ int8_t GiReduceMaxInt8(GI_INT8 Vector) {
ret = Max(_mm_extract_epi32(sum, 3), ret);
return (int8_t)ret;
#elif defined(GI_SSE2_INTRINSICS)
__m64 low = GiGetLowInt8x16(Vector);
__m64 high = GiGetHighInt8x16(Vector);
__m64 low = _mm_movepi64_pi64(Vector);
__m64 high = _mm_movepi64_pi64(_mm_unpackhi_epi64(Vector, Vector));
__m128 v0 = _mm_cvtpi8_ps(low);
__m128 v1 = _mm_cvtpi8_ps(_mm_unpackhi_pi32(low, low));
__m128 v2 = _mm_cvtpi8_ps(high);
__m128 v3 = _mm_cvtpi8_ps(_mm_unpackhi_pi32(high, high));
__m128 sum0 = _mm_add_ps(v0, v1);
__m128 sum1 = _mm_add_ps(v2, v3);
__m128 sum = _mm_add_ps(sum0, sum1);
float ret0 = _mm_cvtss_f32(sum);
float ret1 = _mm_cvtss_f32(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 1, 1, 1)));
float ret2 = _mm_cvtss_f32(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(2, 2, 2, 2)));
float ret3 = _mm_cvtss_f32(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(3, 3, 3, 3)));
__m128 max0 = _mm_max_ps(v0, v1);
__m128 max1 = _mm_max_ps(v2, v3);
__m128 max = _mm_max_ps(max0, max1);
float ret0 = _mm_cvtss_f32(max);
float ret1 = _mm_cvtss_f32(_mm_shuffle_ps(max, max, _MM_SHUFFLE(1, 1, 1, 1)));
float ret2 = _mm_cvtss_f32(_mm_shuffle_ps(max, max, _MM_SHUFFLE(2, 2, 2, 2)));
float ret3 = _mm_cvtss_f32(_mm_shuffle_ps(max, max, _MM_SHUFFLE(3, 3, 3, 3)));
return (int8_t)(Max(Max(ret0, ret1), Max(ret2, ret3)));
#else
int8_t max = Vector[0];
for (size_t i = 1; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) {
for (size_t i = 1; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) {
max = Max(max, Vector[i]);
}
return max;
......@@ -526,23 +529,23 @@ int8_t GiReduceMinInt8(GI_INT8 Vector) {
ret = Min(_mm_extract_epi32(sum, 3), ret);
return (int8_t)ret;
#elif defined(GI_SSE2_INTRINSICS)
__m64 low = GiGetLowInt8x16(Vector);
__m64 high = GiGetHighInt8x16(Vector);
__m64 low = _mm_movepi64_pi64(Vector);
__m64 high = _mm_movepi64_pi64(_mm_unpackhi_epi64(Vector, Vector));
__m128 v0 = _mm_cvtpi8_ps(low);
__m128 v1 = _mm_cvtpi8_ps(_mm_unpackhi_pi32(low, low));
__m128 v2 = _mm_cvtpi8_ps(high);
__m128 v3 = _mm_cvtpi8_ps(_mm_unpackhi_pi32(high, high));
__m128 sum0 = _mm_add_ps(v0, v1);
__m128 sum1 = _mm_add_ps(v2, v3);
__m128 sum = _mm_add_ps(sum0, sum1);
float ret0 = _mm_cvtss_f32(sum);
float ret1 = _mm_cvtss_f32(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 1, 1, 1)));
float ret2 = _mm_cvtss_f32(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(2, 2, 2, 2)));
float ret3 = _mm_cvtss_f32(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(3, 3, 3, 3)));
__m128 min0 = _mm_min_ps(v0, v1);
__m128 min1 = _mm_min_ps(v2, v3);
__m128 min = _mm_min_ps(min0, min1);
float ret0 = _mm_cvtss_f32(min);
float ret1 = _mm_cvtss_f32(_mm_shuffle_ps(min, min, _MM_SHUFFLE(1, 1, 1, 1)));
float ret2 = _mm_cvtss_f32(_mm_shuffle_ps(min, min, _MM_SHUFFLE(2, 2, 2, 2)));
float ret3 = _mm_cvtss_f32(_mm_shuffle_ps(min, min, _MM_SHUFFLE(3, 3, 3, 3)));
return (int8_t)(Min(Min(ret0, ret1), Min(ret2, ret3)));
#else
int8_t min = Vector[0];
for (size_t i = 1; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) {
for (size_t i = 1; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) {
min = Min(min, Vector[i]);
}
return min;
......@@ -561,8 +564,7 @@ GiCvtFromFloat32ToInt8(GI_FLOAT32 src) {
#if __ARM_ARCH >= 8
int32x4_t vres0 = vcvtaq_s32_f32(src);
int16x8_t mid_s16 = vcombine_s16(vqmovn_s32(vres0), vqmovn_s32(vres0));
int8x8_t ret = vqmovn_s16(vcombine_s16(vqmovn_s32(mid_s16), vqmovn_s32(mid_s16)));
return vcombine_s16(ret, ret);
return vcombine_s8(vqmovn_s16(mid_s16), vqmovn_s16(mid_s16));
#else
float32x4_t vzero = vdupq_n_f32(0.f);
float32x4_t vfhalf = vdupq_n_f32(0.5f);
......@@ -570,8 +572,7 @@ GiCvtFromFloat32ToInt8(GI_FLOAT32 src) {
float32x4_t vinc0 = vbslq_f32(vcgeq_f32(src, vzero), vfhalf, vfneg_half);
int32x4_t vres0 = vcvtq_s32_f32(vaddq_f32(src, vinc0));
int16x8_t mid_s16 = vcombine_s16(vqmovn_s32(vres0), vqmovn_s32(vres0));
int8x8_t ret = vqmovn_s16(vcombine_s16(vqmovn_s32(mid_s16), vqmovn_s32(mid_s16)));
return vcombine_s16(ret, ret);
return vcombine_s8(vqmovn_s16(mid_s16), vqmovn_s16(mid_s16));
#endif
#elif defined(GI_SSE42_INTRINSICS)
__m128 vfzero = _mm_set1_ps(0.f);
......
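In `GiCvtFromFloat32ToInt8`, the ARMv8 branch rounds with `vcvtaq_s32_f32` (to nearest, ties away from zero); the ARMv7 branch reproduces that by adding +0.5 to non-negative and -0.5 to negative inputs before the truncating `vcvtq_s32_f32`. A scalar sketch of that rounding trick (illustrative only):

```cpp
// Round half away from zero via the +/-0.5 trick used in the NEON32 branch.
#include <cstdio>

static int round_half_away_from_zero(float x) {
    float inc = x >= 0.f ? 0.5f : -0.5f;
    return static_cast<int>(x + inc);  // cast truncates toward zero, like vcvtq_s32_f32
}

int main() {
    printf("%d %d %d %d\n",
           round_half_away_from_zero(1.5f),    // 2
           round_half_away_from_zero(-1.5f),   // -2
           round_half_away_from_zero(2.4f),    // 2
           round_half_away_from_zero(-2.6f));  // -3
    return 0;
}
```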
/**
* \file dnn/src/arm_common/quantized_converter.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma once
#include "megdnn/dtype.h"
#include "megdnn/oprs.h"
#include "src/common/utils.h"
#include "src/fallback/general_intrinsic/gi_float.h"
#include "src/fallback/general_intrinsic/gi_int.h"
namespace megdnn {
namespace fallback {
struct QConverterBase {
inline static GI_INT32 vzero() { return GiBroadcastInt32(0); }
inline static GI_FLOAT32 vfzero() { return GiBroadcastFloat32(0.f); }
inline static GI_FLOAT32 vfhalf() { return GiBroadcastFloat32(0.5f); }
inline static GI_FLOAT32 vfneg_half() { return GiBroadcastFloat32(-0.5f); }
};
struct QConverter {
template <typename dst_type, typename... src_type>
static inline dst_type convert(const src_type&... src);
template <typename dst_type, typename... src_type>
static inline dst_type round(const src_type&... src);
};
template <>
inline dt_qint8 QConverter::convert(const float& src) {
return dt_qint8(saturate<int8_t, float>(std::round(src), -128, 127));
}
template <>
inline dt_quint8 QConverter::convert(const float& src, const uint8_t& zp) {
return dt_quint8(saturate<uint8_t, float>(std::round(src) + zp, 0, 255));
}
template <>
inline dt_qint32 QConverter::convert(const float& src) {
return dt_qint32(saturate<int32_t, float>(
std::round(src), static_cast<float>(std::numeric_limits<int32_t>::min()),
static_cast<float>(std::numeric_limits<int32_t>::max())));
}
template <>
inline GI_FLOAT32_V2 QConverter::convert(const GI_INT16& vsrc) {
GI_INT32 vhi = GiMoveHighLongInt16(vsrc);
GI_INT32 vlo = GiMoveLowLongInt16(vsrc);
return {{GiCastToFloat32(vlo), GiCastToFloat32(vhi)}};
}
template <>
inline GI_INT8 QConverter::convert(const GI_FLOAT32_V2& vsrc) {
return GiCvtFromFloat32V2ToInt8(vsrc);
}
template <>
inline GI_INT8 QConverter::convert(const GI_FLOAT32& src) {
return GiCvtFromFloat32ToInt8(src);
}
template <>
inline GI_INT32 QConverter::round(const GI_FLOAT32& vsrc) {
return GiRoundAsInt32(vsrc);
}
} // namespace fallback
} // namespace megdnn
// vim: syntax=cpp.doxygen
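For reference, `QConverter::convert<dt_qint8>(float)` above is a round-then-saturate into `[-128, 127]`; the vector specializations do the same lane-wise through `GiMoveLowLongInt16`/`GiMoveHighLongInt16` and `GiCvtFromFloat32ToInt8`. A scalar sketch of the conversion, assuming only what the header shows:

```cpp
// Scalar equivalent of the dt_qint8 specialization: round to nearest, then
// clamp to the int8 range before the narrowing cast.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

static int8_t convert_f32_to_q8(float src) {
    float r = std::round(src);
    r = std::min(127.f, std::max(-128.f, r));
    return static_cast<int8_t>(r);
}

int main() {
    printf("%d %d %d\n",
           convert_f32_to_q8(3.4f),    // 3
           convert_f32_to_q8(200.f),   // 127 (saturated)
           convert_f32_to_q8(-300.f)); // -128 (saturated)
    return 0;
}
```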
......@@ -14,11 +14,13 @@
#include "src/naive/handle.h"
#include "midout.h"
#include "reducer.h"
#include "src/common/reduce_helper.h"
MIDOUT_DECL(megdnn_fb_reduce_op)
MIDOUT_DECL(megdnn_fb_reduce_c)
MIDOUT_DECL(megdnn_fb_reduce_dtype)
MIDOUT_DECL(megdnn_fallback_reduce_optimized)
namespace {
......@@ -77,11 +79,20 @@ namespace fallback {
void ReduceImpl::exec(
_megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace workspace) {
check_exec(src.layout, dst.layout, workspace.size);
if (!exec_optimized(src, dst, workspace)) {
return exec_fallback(src, dst, workspace);
}
}
void ReduceImpl::exec_fallback(
_megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace workspace) {
using namespace reduce;
using Mode = Param::Mode;
check_exec(src.layout, dst.layout, workspace.size);
size_t A, B, C;
get_ABC(src.layout, A, B, C, param().axis);
#define cb_by_op(src_type, dst_type, _wtype, mode_, Op_, kern_func) \
if (param().mode == mode_) { \
typedef DTypeTrait<src_type>::ctype src_ctype; \
......@@ -176,6 +187,101 @@ void ReduceImpl::exec(
naive::ReduceForwardImpl::exec(src, dst, workspace);
}
bool ReduceImpl::exec_optimized(
_megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace) {
size_t A, B, C;
reduce::get_ABC(src.layout, A, B, C, param().axis);
bool execed = false;
using Mode = param::Reduce::Mode;
#define DISPATCH_FUNC(Reducer, dtype, ctype, comp_type) \
if (C == 1) { \
using _Reducer = Reducer<dtype, ctype, comp_type, true>; \
std::function<void(const ctype*, ctype*, DType, size_t, size_t, size_t)> \
do_reduce = Exec<_Reducer, true>::do_reduce; \
MIDOUT_BEGIN( \
megdnn_fallback_reduce_optimized, ctype, dtype, comp_type, \
midout_iv(0)) { \
MEGDNN_DISPATCH_CPU_KERN_OPR(do_reduce( \
reinterpret_cast<ctype*>(src.raw_ptr()), \
reinterpret_cast<ctype*>(dst.raw_ptr()), src_type, A, B, C)); \
execed = true; \
} \
MIDOUT_END(); \
} else { \
using _Reducer = Reducer<dtype, ctype, comp_type, false>; \
std::function<void(const ctype*, ctype*, DType, size_t, size_t, size_t)> \
do_reduce = Exec<_Reducer, false>::do_reduce; \
MIDOUT_BEGIN( \
megdnn_fallback_reduce_optimized, ctype, dtype, comp_type, \
midout_iv(1)) { \
MEGDNN_DISPATCH_CPU_KERN_OPR(do_reduce( \
reinterpret_cast<ctype*>(src.raw_ptr()), \
reinterpret_cast<ctype*>(dst.raw_ptr()), src_type, A, B, C)); \
execed = true; \
} \
MIDOUT_END(); \
}
#define DISPATCH_MODE_QUANTIZED(dtype, ctype, comp_type) \
switch (param().mode) { \
case Mode::MEAN: \
DISPATCH_FUNC(MeanReducer, dtype, ctype, comp_type); \
break; \
case Mode::MAX: \
DISPATCH_FUNC(maxReducer, dtype, ctype, ctype); \
break; \
case Mode::MIN: \
DISPATCH_FUNC(minReducer, dtype, ctype, ctype); \
break; \
default: \
break; \
}
#define DISPATCH_MODE_FLOAT(dtype, ctype, comp_type) \
switch (param().mode) { \
case Mode::MEAN: \
DISPATCH_FUNC(MeanReducer, dtype, ctype, comp_type); \
break; \
case Mode::MAX: \
DISPATCH_FUNC(maxReducer, dtype, ctype, ctype); \
break; \
case Mode::MIN: \
DISPATCH_FUNC(minReducer, dtype, ctype, ctype); \
break; \
case Mode::SUM: \
DISPATCH_FUNC(SumReducer, dtype, ctype, ctype); \
break; \
case Mode::SUM_SQR: \
DISPATCH_FUNC(SumSqrReducer, dtype, ctype, ctype); \
break; \
case Mode::PRODUCT: \
DISPATCH_FUNC(ProductReducer, dtype, ctype, ctype); \
break; \
default: \
break; \
}
if (src.layout.is_contiguous() &&
src.layout.dtype.category() == DTypeCategory::QUANTIZED &&
param().data_type == param::Reduce::DataType::DEFAULT) {
DType src_type = src.layout.dtype;
if (src.layout.dtype.enumv() == DTypeEnum::QuantizedS8) {
DISPATCH_MODE_QUANTIZED(dt_qint8, int8_t, int32_t)
}
} else if (
src.layout.is_contiguous() &&
src.layout.dtype.category() == DTypeCategory::FLOAT &&
param().data_type == param::Reduce::DataType::DEFAULT) {
DType src_type = src.layout.dtype;
if (src.layout.dtype.enumv() == DTypeEnum::Float32) {
DISPATCH_MODE_FLOAT(dt_float32, float, float)
}
}
return execed;
#undef DISPATCH_FUNC
#undef DISPATCH_MODE_QUANTIZED
#undef DISPATCH_MODE_FLOAT
}
} // namespace fallback
} // namespace megdnn
// vim: syntax=cpp.doxygen
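`exec_optimized` leans on `reduce::get_ABC` (from `src/common/reduce_helper.h`) to flatten the tensor around the reduce axis: `A` is the product of the leading dims, `B` the extent being reduced, `C` the product of the trailing dims, and `C == 1` picks the contiguous specialization of `Exec`. A sketch of that decomposition as inferred from the dispatch above (the exact helper signature is an assumption):

```cpp
// Assumed semantics of get_ABC: shape {3, 45, 16} reduced along axis 1
// becomes A=3, B=45, C=16; reducing the last axis gives C=1.
#include <cstddef>
#include <cstdio>
#include <vector>

static void get_abc(const std::vector<size_t>& shape, size_t axis,
                    size_t& A, size_t& B, size_t& C) {
    A = 1;
    for (size_t i = 0; i < axis; i++) A *= shape[i];
    B = shape[axis];
    C = 1;
    for (size_t i = axis + 1; i < shape.size(); i++) C *= shape[i];
}

int main() {
    size_t A, B, C;
    get_abc({3, 45, 16}, 1, A, B, C);
    printf("A=%zu B=%zu C=%zu\n", A, B, C);  // A=3 B=45 C=16
    return 0;
}
```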
......@@ -19,6 +19,10 @@ public:
using ReduceForwardImpl::ReduceForwardImpl;
void exec(
_megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace) override;
bool exec_optimized(
_megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace);
void exec_fallback(
_megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace);
};
} // namespace fallback
......
/**
* \file dnn/src/fallback/reduce/reducer.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#pragma once
#include "src/common/utils.h"
#include "src/fallback/general_intrinsic/gi_float.h"
#include "src/fallback/general_intrinsic/gi_int.h"
#include "src/fallback/quantized_converter.h"
using namespace megdnn;
using namespace fallback;
namespace {
/*****************************Mean Reducer***********************/
template <typename dtype, typename ctype, typename comp_type, bool C1>
struct MeanReducer;
template <>
struct MeanReducer<dt_qint8, int8_t, int32_t, true> {
using ctype = int8_t;
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t);
int32_t res;
float coef;
MeanReducer(DType, size_t cnt) : res(0), coef(1.0 / cnt) {}
MeanReducer() = default;
void feed(const int8_t* val) { res += GiReduceAddInt8(GiLoadInt8(val)); }
void feed_remain(const int8_t* val) { res += *val; }
void post(int8_t* dst) {
float sum = res * coef;
*dst = std::round(sum);
}
};
template <>
struct MeanReducer<dt_qint8, int8_t, int32_t, false> {
using ctype = int8_t;
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t);
GI_INT32 res[4];
int32_t remain;
int32_t cnt;
float coef;
GI_FLOAT32 vcoef;
MeanReducer(DType, size_t cnt) : remain(0), cnt(cnt), coef(1.0 / cnt) {
memset(res, 0, sizeof(res));
vcoef = GiBroadcastFloat32(coef);
}
MeanReducer() = default;
void feed(const int8_t* val) {
const GI_INT8 vval = GiLoadInt8(val);
const GI_INT16 vval_low = GiMoveLowLongInt8(vval);
const GI_INT16 vval_high = GiMoveHighLongInt8(vval);
const GI_INT32 vval_low_low = GiMoveLowLongInt16(vval_low);
const GI_INT32 vval_low_high = GiMoveHighLongInt16(vval_low);
const GI_INT32 vval_high_low = GiMoveLowLongInt16(vval_high);
const GI_INT32 vval_high_high = GiMoveHighLongInt16(vval_high);
res[0] = GiAddInt32(res[0], vval_low_low);
res[1] = GiAddInt32(res[1], vval_low_high);
res[2] = GiAddInt32(res[2], vval_high_low);
res[3] = GiAddInt32(res[3], vval_high_high);
}
void feed_remain(const int8_t* val) { remain += *val; }
void post(int8_t* dst) {
for (int i = 0; i < 4; i += 2) {
GI_FLOAT32 vitem0 = GiMultiplyFloat32(GiCastToFloat32(res[i]), vcoef);
GI_FLOAT32 vitem1 = GiMultiplyFloat32(GiCastToFloat32(res[i + 1]), vcoef);
GiStoreLowInt8(
dst,
(QConverter::convert<GI_INT8, GI_FLOAT32_V2>({{vitem0, vitem1}})));
dst += 8;
}
}
void post_remain(int8_t* dst) {
float sum = remain * coef;
*dst = std::round(sum);
}
};
template <>
struct MeanReducer<dt_float32, float, float, true> {
using ctype = float;
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float);
GI_FLOAT32 res;
float result;
float coef;
MeanReducer(DType, size_t cnt) : result(0.0f), coef(1.0 / cnt) {
res = GiBroadcastFloat32(0.0f);
}
MeanReducer() = default;
void feed(const float* val) { res = GiAddFloat32(GiLoadFloat32(val), res); }
void feed_remain(const float* val) { result += *val; }
void post(float* dst) {
result += GiReduceAddFloat32(res);
*dst = result * coef;
}
};
template <>
struct MeanReducer<dt_float32, float, float, false> {
using ctype = float;
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float);
GI_FLOAT32 res;
float remain;
float coef;
MeanReducer(DType, size_t cnt) : remain(0.0f), coef(1.0 / cnt) {
res = GiBroadcastFloat32(0.0f);
}
MeanReducer() = default;
void feed(const float* val) { res = GiAddFloat32(GiLoadFloat32(val), res); }
void feed_remain(const float* val) { remain += *val; }
void post(float* dst) {
res = GiMultiplyScalerFloat32(res, coef);
GiStoreFloat32(dst, res);
}
void post_remain(float* dst) { *dst = remain * coef; }
};
/******************************max min Reducer****************************/
template <typename dtype, typename ctype, typename comp_type, bool C1>
struct maxReducer;
template <typename dtype, typename ctype, typename comp_type, bool C1>
struct minReducer;
#define REDUCER_MAX_MIN_C1(_mode, _Mode, _init) \
template <> \
struct _mode##Reducer<dt_float32, float, float, true> { \
using ctype = float; \
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \
GI_FLOAT32 res; \
_mode##Reducer(DType, size_t) { res = GiBroadcastFloat32(_init); } \
_mode##Reducer() = default; \
void feed(const float* val) { \
auto vval = GiLoadFloat32(val); \
res = Gi##_Mode##imumFloat32(vval, res); \
} \
void feed_remain(const float* val) { \
auto vval = GiBroadcastFloat32(*val); \
res = Gi##_Mode##imumFloat32(vval, res); \
} \
void post(float* dst) { *dst = GiReduce##_Mode##imumFloat32(res); } \
}
REDUCER_MAX_MIN_C1(max, Max, std::numeric_limits<dt_float32>::lowest());
REDUCER_MAX_MIN_C1(min, Min, std::numeric_limits<dt_float32>::max());
#undef REDUCER_MAX_MIN_C1
#define REDUCER_MAX_MIN_C(_mode, _Mode, _init) \
template <> \
struct _mode##Reducer<dt_float32, float, float, false> { \
using ctype = float; \
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \
GI_FLOAT32 res; \
float remain; \
_mode##Reducer(DType, size_t) { \
res = GiBroadcastFloat32(_init); \
remain = _init; \
} \
_mode##Reducer() = default; \
void feed(const float* val) { \
GI_FLOAT32 vval = GiLoadFloat32(val); \
res = Gi##_Mode##imumFloat32(vval, res); \
} \
void feed_remain(const float* val) { \
using namespace std; \
remain = _mode(*val, remain); \
} \
void post(float* dst) { GiStoreFloat32(dst, res); } \
void post_remain(float* dst) { *dst = remain; } \
}
REDUCER_MAX_MIN_C(max, Max, std::numeric_limits<dt_float32>::lowest());
REDUCER_MAX_MIN_C(min, Min, std::numeric_limits<dt_float32>::max());
#undef REDUCER_MAX_MIN_C
#define REDUCER_MAX_MIN_C1(_mode, _Mode, _init) \
template <> \
struct _mode##Reducer<dt_qint8, int8_t, int8_t, true> { \
using ctype = int8_t; \
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t); \
GI_INT8 res; \
_mode##Reducer(DType, size_t) { res = GiBroadcastInt8(_init); } \
_mode##Reducer() = default; \
void feed(const int8_t* val) { \
GI_INT8 vval = GiLoadInt8(val); \
res = Gi##_Mode##imumInt8(vval, res); \
} \
void feed_remain(const int8_t* val) { \
GI_INT8 vval = GiBroadcastInt8(*val); \
res = Gi##_Mode##imumInt8(vval, res); \
} \
void post(int8_t* dst) { *dst = GiReduce##_Mode##Int8(res); } \
}
REDUCER_MAX_MIN_C1(max, Max, -128);
REDUCER_MAX_MIN_C1(min, Min, 127);
#undef REDUCER_MAX_MIN_C1
#define REDUCER_MAX_MIN_C(_mode, _Mode, _init) \
template <> \
struct _mode##Reducer<dt_qint8, int8_t, int8_t, false> { \
using ctype = int8_t; \
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t); \
GI_INT8 res; \
int8_t remain; \
_mode##Reducer(DType, size_t) { \
res = GiBroadcastInt8(_init); \
remain = _init; \
} \
_mode##Reducer() = default; \
void feed(const int8_t* val) { \
GI_INT8 vval = GiLoadInt8(val); \
res = Gi##_Mode##imumInt8(vval, res); \
} \
void feed_remain(const int8_t* val) { \
using namespace std; \
remain = _mode(*val, remain); \
} \
void post(int8_t* dst) { GiStoreInt8(dst, res); } \
void post_remain(int8_t* dst) { *dst = remain; } \
}
REDUCER_MAX_MIN_C(max, Max, -128);
REDUCER_MAX_MIN_C(min, Min, 127);
#undef REDUCER_MAX_MIN_C
/***************************Sum Product Reducer***************************/
template <typename dtype, typename ctype, typename comp_type, bool C1>
struct SumReducer;
template <typename dtype, typename ctype, typename comp_type, bool C1>
struct ProductReducer;
#define REDUCER_SUM_PRODUCT_C1(_mode, _Mode, _op, _init) \
template <> \
struct _mode##Reducer<dt_float32, float, float, true> { \
using ctype = float; \
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \
GI_FLOAT32 res; \
float remain; \
_mode##Reducer(DType, size_t) { \
res = GiBroadcastFloat32(_init); \
remain = _init; \
} \
_mode##Reducer() = default; \
void feed(const float* val) { \
GI_FLOAT32 vval = GiLoadFloat32(val); \
res = Gi##_Mode##Float32(vval, res); \
} \
void feed_remain(const float* val) { \
using namespace std; \
auto op = _op<float>(); \
remain = op(remain, *val); \
} \
void post(float* dst) { \
using namespace std; \
auto op = _op<float>(); \
*dst = op(remain, GiReduce##_Mode##Float32(res)); \
} \
}
REDUCER_SUM_PRODUCT_C1(Sum, Add, plus, 0.0f);
REDUCER_SUM_PRODUCT_C1(Product, Multiply, multiplies, 1.0f);
#undef REDUCER_SUM_PRODUCT_C1
#define REDUCER_SUM_PRODUCT_C(_mode, _Mode, _op, _init) \
template <> \
struct _mode##Reducer<dt_float32, float, float, false> { \
using ctype = float; \
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \
GI_FLOAT32 res; \
float remain; \
_mode##Reducer(DType, size_t) { \
res = GiBroadcastFloat32(_init); \
remain = _init; \
} \
_mode##Reducer() = default; \
void feed(const float* val) { \
GI_FLOAT32 vval = GiLoadFloat32(val); \
res = Gi##_Mode##Float32(vval, res); \
} \
void feed_remain(const float* val) { \
using namespace std; \
auto op = _op<float>(); \
remain = op(remain, (*val)); \
} \
void post(float* dst) { GiStoreFloat32(dst, res); } \
void post_remain(float* dst) { *dst = remain; } \
}
REDUCER_SUM_PRODUCT_C(Sum, Add, plus, 0.0f);
REDUCER_SUM_PRODUCT_C(Product, Multiply, multiplies, 1.0f);
#undef REDUCER_SUM_PRODUCT_C
/***************************SumSqr Reducer***************************/
template <typename dtype, typename ctype, typename comp_type, bool C1>
struct SumSqrReducer;
template <>
struct SumSqrReducer<dt_float32, float, float, true> {
using ctype = float;
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float);
GI_FLOAT32 res;
float result;
SumSqrReducer(DType, size_t cnt) : result(0.0f) {
MEGDNN_MARK_USED_VAR(cnt);
res = GiBroadcastFloat32(0.0f);
}
SumSqrReducer() = default;
void feed(const float* val) {
GI_FLOAT32 vval = GiLoadFloat32(val);
res = GiAddFloat32(GiMultiplyFloat32(vval, vval), res);
}
void feed_remain(const float* val) {
float vval = *val;
result += vval * vval;
}
void post(float* dst) {
result += GiReduceAddFloat32(res);
*dst = result;
}
};
template <>
struct SumSqrReducer<dt_float32, float, float, false> {
using ctype = float;
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float);
GI_FLOAT32 res;
float remain;
SumSqrReducer(DType, size_t cnt) : remain(0.0f) {
MEGDNN_MARK_USED_VAR(cnt);
res = GiBroadcastFloat32(0.0f);
}
SumSqrReducer() = default;
void feed(const float* val) {
GI_FLOAT32 vval = GiLoadFloat32(val);
res = GiAddFloat32(GiMultiplyFloat32(vval, vval), res);
}
void feed_remain(const float* val) { remain += (*val) * (*val); }
void post(float* dst) { GiStoreFloat32(dst, res); }
void post_remain(float* dst) { *dst = remain; }
};
/**************************************do reduce*************************/
template <typename Reducer, bool C1>
struct Exec {
static void do_reduce(
const typename Reducer::ctype* src, typename Reducer::ctype* dst,
DType src_dtype, size_t A, size_t B, size_t C);
};
template <typename Reducer>
struct Exec<Reducer, true> {
static void do_reduce(
const typename Reducer::ctype* src, typename Reducer::ctype* dst,
DType src_dtype, size_t A, size_t B, size_t) {
size_t a = 0;
for (; a < A; a++) {
Reducer reducer0(src_dtype, B);
auto temp_src0 = src + a * B;
size_t b = 0;
for (; b + Reducer::SIMD_WIDTH <= B; b += Reducer::SIMD_WIDTH) {
reducer0.feed(temp_src0);
temp_src0 += Reducer::SIMD_WIDTH;
}
for (; b < B; b++) {
reducer0.feed_remain(temp_src0);
temp_src0++;
}
reducer0.post(dst);
dst++;
}
}
};
template <typename Reducer>
struct Exec<Reducer, false> {
static void do_reduce(
const typename Reducer::ctype* src, typename Reducer::ctype* dst,
DType src_dtype, size_t A, size_t B, size_t C) {
for (size_t a = 0; a < A; a++) {
size_t c = 0;
for (; c + Reducer::SIMD_WIDTH <= C; c += Reducer::SIMD_WIDTH) {
Reducer reducer(src_dtype, B);
for (size_t b = 0; b < B; b++)
reducer.feed(src + c + C * b);
reducer.post(dst);
dst += Reducer::SIMD_WIDTH;
}
for (; c < C; c++) {
Reducer reducer(src_dtype, B);
for (size_t b = 0; b < B; b++)
reducer.feed_remain(src + c + C * b);
reducer.post_remain(dst);
dst++;
}
src += B * C;
}
}
};
} // namespace
// vim: syntax=cpp.doxygen
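All reducers above follow the protocol that `Exec` drives: `feed()` consumes `SIMD_WIDTH` elements through GI vectors, `feed_remain()` handles the scalar tail, and `post()`/`post_remain()` write the results. A toy scalar reducer pushed through the same loop shape (names are illustrative, not from the commit):

```cpp
// Minimal model of the Exec<Reducer, true> inner loop: vector body plus
// scalar tail, then a single post() per output element.
#include <cstddef>
#include <cstdio>

struct ToySumReducer {
    static constexpr int SIMD_WIDTH = 4;  // stands in for GI_SIMD_LEN_BYTE / sizeof(float)
    float acc = 0.f;
    void feed(const float* p) {           // real reducers use GiLoadFloat32 + GiAddFloat32
        for (int i = 0; i < SIMD_WIDTH; i++) acc += p[i];
    }
    void feed_remain(const float* p) { acc += *p; }
    void post(float* dst) { *dst = acc; }
};

int main() {
    float src[7] = {1, 2, 3, 4, 5, 6, 7}, dst = 0.f;
    ToySumReducer r;
    size_t b = 0, B = 7;
    for (; b + ToySumReducer::SIMD_WIDTH <= B; b += ToySumReducer::SIMD_WIDTH)
        r.feed(src + b);
    for (; b < B; b++)
        r.feed_remain(src + b);
    r.post(&dst);
    printf("%f\n", dst);  // 28.0
    return 0;
}
```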
......@@ -181,7 +181,6 @@ TEST_F(ARM_COMMON, LSTM_FORWARD_RECORD) {
TEST_F(ARM_COMMON, BENCHMARK_LSTM_FORWARD) {
Benchmarker<LSTM> optimized_bench(handle());
constexpr size_t RUNS = 20;
auto run = [&](size_t hidden_size, size_t input_size) {
optimized_bench.set_times(20).set_display(true);
size_t gate_hidden_size = 4 * hidden_size;
......
......@@ -18,6 +18,75 @@
using namespace megdnn;
using namespace test;
TEST_F(FALLBACK, REDUCE_FULL) {
using Param = Reduce::Param;
using Mode = Param::Mode;
Checker<Reduce> checker(handle());
UniformIntRNG rng{INT8_MIN >> 1, INT8_MAX >> 1};
checker.set_rng(0, &rng);
struct Config {
Param param;
DType dtype;
TensorShape shape;
Config(Param param, DType dtype, TensorShape shape)
: param(param), dtype(dtype), shape(shape) {}
};
std::vector<Config> configs;
for (auto mode : {Mode::MEAN, Mode::MAX, Mode::MIN})
for (auto dtype : std::vector<DType>{
dtype::Float32(), dtype::Float16(), dtype::QuantizedS8(1.3f),
dtype::Quantized8Asymm(1.3f, static_cast<uint8_t>(3))})
for (int32_t axis : {0, 1, 2}) {
for (size_t A : {1, 3, 5}) {
for (size_t B : {4, 6, 9, 16, 33, 45}) {
for (size_t C : {4, 6, 9, 16, 33, 45}) {
TensorShape shape{A, B, C};
Param param(mode, axis);
Config config(param, dtype, shape);
configs.push_back(config);
}
}
}
}
for (auto&& config : configs) {
auto&& dtype = config.dtype;
auto&& param = config.param;
auto&& shape = config.shape;
checker.set_dtype(0, dtype).set_param(param).execs({shape, {}});
}
configs.clear();
for (auto mode : {Mode::SUM, Mode::PRODUCT, Mode::SUM_SQR})
for (auto dtype : std::vector<DType>{dtype::Float32(), dtype::Float16()})
for (int32_t axis : {0, 1, 2}) {
for (size_t A : {1, 3, 5}) {
for (size_t B : {4, 6, 9, 16, 33, 45}) {
for (size_t C : {4, 6, 9, 16, 33, 45}) {
TensorShape shape{A, B, C};
Param param(mode, axis);
Config config(param, dtype, shape);
configs.push_back(config);
}
}
}
}
UniformFloatRNG rng_float(-2, 2);
checker.set_rng(0, &rng_float);
checker.set_epsilon(1e-1);
for (auto&& config : configs) {
auto&& dtype = config.dtype;
auto&& param = config.param;
auto&& shape = config.shape;
if (dtype == dtype::Float16())
checker.set_epsilon(1e-1);
else
checker.set_epsilon(1e-3);
checker.set_dtype(0, dtype).set_param(param).execs({shape, {}});
}
}
TEST_F(FALLBACK, REDUCE) {
using Param = Reduce::Param;
using Mode = Param::Mode;
......