Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
MegEngine 天元
MegEngine
提交
93c7e451
MegEngine
项目概览
MegEngine 天元
/
MegEngine
11 个月 前同步成功
通知
392
Star
4702
Fork
582
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
MegEngine
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
93c7e451
编写于
2月 23, 2022
作者:
M
Megvii Engine Team
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
feat(arm): delete the reduant implement
GitOrigin-RevId: ff32a3dc8b33c264956c64ccf730d363057ac501
上级
e34a642b
变更
5
隐藏空白更改
内联
并排
Showing
5 changed file
with
28 addition
and
316 deletion
+28
-316
dnn/src/arm_common/reduce/opr_impl.cpp
dnn/src/arm_common/reduce/opr_impl.cpp
+2
-291
dnn/src/fallback/general_intrinsic/gi_common.h
dnn/src/fallback/general_intrinsic/gi_common.h
+7
-1
dnn/src/fallback/general_intrinsic/gi_float.h
dnn/src/fallback/general_intrinsic/gi_float.h
+7
-16
dnn/src/fallback/general_intrinsic/gi_int.h
dnn/src/fallback/general_intrinsic/gi_int.h
+8
-4
dnn/src/fallback/reduce/reducer.h
dnn/src/fallback/reduce/reducer.h
+4
-4
未找到文件。
dnn/src/arm_common/reduce/opr_impl.cpp
浏览文件 @
93c7e451
...
...
@@ -39,33 +39,6 @@ typedef __fp16 fp16_fix_t;
template
<
typename
dtype
,
typename
ctype
,
typename
comp_type
,
bool
C1
>
struct
MeanReducer
;
template
<
>
struct
MeanReducer
<
dt_qint8
,
int8_t
,
int32_t
,
true
>
{
using
ctype
=
int8_t
;
static
constexpr
int
SIMD_WIDTH
=
16
;
int32_t
res
;
float
coef
;
MeanReducer
(
DType
,
size_t
cnt
)
:
res
(
0
),
coef
(
1.0
/
cnt
)
{}
MeanReducer
()
=
default
;
void
feed
(
const
int8_t
*
val
)
{
#if MEGDNN_AARCH64
res
+=
vaddlvq_s8
(
vld1q_s8
(
val
));
#elif MEGDNN_ARMV7
auto
sum
=
vpaddlq_s16
(
vpaddlq_s8
(
vld1q_s8
(
val
)));
res
+=
(
vgetq_lane_s32
(
sum
,
0
)
+
vgetq_lane_s32
(
sum
,
1
)
+
vgetq_lane_s32
(
sum
,
2
)
+
vgetq_lane_s32
(
sum
,
3
));
#else
#error "unsupport android arch"
#endif
}
void
feed_remain
(
const
int8_t
*
val
)
{
res
+=
*
val
;
}
void
post
(
int8_t
*
dst
)
{
float
sum
=
res
*
coef
;
*
dst
=
std
::
round
(
sum
);
}
};
template
<
>
struct
MeanReducer
<
dt_quint8
,
uint8_t
,
int32_t
,
true
>
{
using
ctype
=
uint8_t
;
...
...
@@ -97,33 +70,6 @@ struct MeanReducer<dt_quint8, uint8_t, int32_t, true> {
}
};
template
<
>
struct
MeanReducer
<
dt_float32
,
float
,
float
,
true
>
{
using
ctype
=
float
;
static
constexpr
int
SIMD_WIDTH
=
4
;
float32x4_t
res
;
float
result
;
float
coef
;
MeanReducer
(
DType
,
size_t
cnt
)
:
result
(
0.0
f
),
coef
(
1.0
/
cnt
)
{
res
=
vdupq_n_f32
(
0.0
f
);
}
MeanReducer
()
=
default
;
void
feed
(
const
float
*
val
)
{
res
=
vaddq_f32
(
vld1q_f32
(
val
),
res
);
}
void
feed_remain
(
const
float
*
val
)
{
result
+=
*
val
;
}
void
post
(
float
*
dst
)
{
#if MEGDNN_AARCH64
result
+=
vaddvq_f32
(
res
);
#elif MEGDNN_ARMV7
auto
sum_temp
=
vpadd_f32
(
vget_low_f32
(
res
),
vget_high_f32
(
res
));
result
+=
(
vget_lane_f32
(
sum_temp
,
0
)
+
vget_lane_f32
(
sum_temp
,
1
));
#else
#error "unsupport android arch"
#endif
*
dst
=
result
*
coef
;
}
};
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
template
<
>
struct
MeanReducer
<
__fp16
,
__fp16
,
__fp16
,
true
>
{
...
...
@@ -170,73 +116,6 @@ struct MeanReducer<__fp16, __fp16, __fp16, false> {
};
#endif
template
<
>
struct
MeanReducer
<
dt_float32
,
float
,
float
,
false
>
{
using
ctype
=
float
;
static
constexpr
int
SIMD_WIDTH
=
4
;
float32x4_t
res
;
float
remain
;
float
coef
;
MeanReducer
(
DType
,
size_t
cnt
)
:
remain
(
0.0
f
),
coef
(
1.0
/
cnt
)
{
res
=
vdupq_n_f32
(
0.0
f
);
}
MeanReducer
()
=
default
;
void
feed
(
const
float
*
val
)
{
res
=
vaddq_f32
(
vld1q_f32
(
val
),
res
);
}
void
feed_remain
(
const
float
*
val
)
{
remain
+=
*
val
;
}
void
post
(
float
*
dst
)
{
res
=
vmulq_n_f32
(
res
,
coef
);
vst1q_f32
(
dst
,
res
);
}
void
post_remain
(
float
*
dst
)
{
*
dst
=
remain
*
coef
;
}
};
template
<
>
struct
MeanReducer
<
dt_qint8
,
int8_t
,
int32_t
,
false
>
{
using
ctype
=
int8_t
;
static
constexpr
int
SIMD_WIDTH
=
16
;
int32x4_t
res
[
4
];
int32_t
remain
;
int32_t
cnt
;
float
coef
;
float32x4_t
vcoef
;
MeanReducer
(
DType
,
size_t
cnt
)
:
remain
(
0
),
cnt
(
cnt
),
coef
(
1.0
/
cnt
)
{
memset
(
res
,
0
,
sizeof
(
res
));
vcoef
=
vdupq_n_f32
(
coef
);
}
MeanReducer
()
=
default
;
void
feed
(
const
int8_t
*
val
)
{
const
int8x16_t
vval
=
vld1q_s8
(
val
);
const
int16x8_t
vval_low
=
vmovl_s8
(
vget_low_s8
(
vval
));
const
int16x8_t
vval_high
=
vmovl_s8
(
vget_high_s8
(
vval
));
const
int32x4_t
vval_low_low
=
vmovl_s16
(
vget_low_s16
(
vval_low
));
const
int32x4_t
vval_low_high
=
vmovl_s16
(
vget_high_s16
(
vval_low
));
const
int32x4_t
vval_high_low
=
vmovl_s16
(
vget_low_s16
(
vval_high
));
const
int32x4_t
vval_high_high
=
vmovl_s16
(
vget_high_s16
(
vval_high
));
res
[
0
]
=
vaddq_s32
(
res
[
0
],
vval_low_low
);
res
[
1
]
=
vaddq_s32
(
res
[
1
],
vval_low_high
);
res
[
2
]
=
vaddq_s32
(
res
[
2
],
vval_high_low
);
res
[
3
]
=
vaddq_s32
(
res
[
3
],
vval_high_high
);
}
void
feed_remain
(
const
int8_t
*
val
)
{
remain
+=
*
val
;
}
void
post
(
int8_t
*
dst
)
{
for
(
int
i
=
0
;
i
<
4
;
i
+=
2
)
{
float32x4_t
vitem0
=
vmulq_f32
(
vcvtq_f32_s32
(
res
[
i
]),
vcoef
);
float32x4_t
vitem1
=
vmulq_f32
(
vcvtq_f32_s32
(
res
[
i
+
1
]),
vcoef
);
vst1_s8
(
dst
,
(
QConverter
::
convert
<
int8x8_t
,
float32x4x2_t
>
({{
vitem0
,
vitem1
}})));
dst
+=
8
;
}
}
void
post_remain
(
int8_t
*
dst
)
{
float
sum
=
remain
*
coef
;
*
dst
=
std
::
round
(
sum
);
}
};
template
<
>
struct
MeanReducer
<
dt_quint8
,
uint8_t
,
int32_t
,
false
>
{
using
ctype
=
uint8_t
;
...
...
@@ -335,8 +214,6 @@ struct minReducer;
} \
}
REDUCER_MAX_MIN_C1
(
max
,
dt_qint8
,
int8_t
,
int8_t
,
s
,
int
,
-
128
);
REDUCER_MAX_MIN_C1
(
min
,
dt_qint8
,
int8_t
,
int8_t
,
s
,
int
,
127
);
REDUCER_MAX_MIN_C1
(
max
,
dt_quint8
,
uint8_t
,
uint8_t
,
u
,
uint
,
0
);
REDUCER_MAX_MIN_C1
(
min
,
dt_quint8
,
uint8_t
,
uint8_t
,
u
,
uint
,
255
);
#undef REDUCER_MAX_MIN_C1
...
...
@@ -364,45 +241,10 @@ REDUCER_MAX_MIN_C1(min, dt_quint8, uint8_t, uint8_t, u, uint, 255);
void post_remain(ctype* dst) { vst1q_lane_##_stype(dst, remain, 0); } \
}
REDUCER_MAX_MIN_C
(
max
,
dt_qint8
,
int8_t
,
int8_t
,
s8
,
int
,
-
128
);
REDUCER_MAX_MIN_C
(
min
,
dt_qint8
,
int8_t
,
int8_t
,
s8
,
int
,
127
);
REDUCER_MAX_MIN_C
(
max
,
dt_quint8
,
uint8_t
,
uint8_t
,
u8
,
uint
,
0
);
REDUCER_MAX_MIN_C
(
min
,
dt_quint8
,
uint8_t
,
uint8_t
,
u8
,
uint
,
255
);
#undef REDUCER_MAX_MIN_C
#define REDUCER_MAX_MIN_C1( \
_mode, _dtype, _ctype, _comp_type, _stype, __stype, _num, _init) \
template <> \
struct _mode##Reducer<_dtype, _ctype, _comp_type, true> { \
using ctype = _ctype; \
static constexpr int SIMD_WIDTH = _num; \
__stype res; \
_mode##Reducer(DType, size_t) { res = vdupq_n_##_stype(_init); } \
_mode##Reducer() = default; \
void feed(const ctype* val) { \
__stype vval = vld1q_##_stype(val); \
res = v##_mode##q_##_stype(vval, res); \
} \
void feed_remain(const ctype* val) { \
__stype vval = vdupq_n_##_stype(*val); \
res = v##_mode##q_##_stype(vval, res); \
} \
void post(ctype* dst) { \
auto val = v##_mode##_##_stype( \
vget_low_##_stype(res), vget_high_##_stype(res)); \
using namespace std; \
*dst = _mode({vget_lane_##_stype(val, 0), vget_lane_##_stype(val, 1)}); \
} \
}
REDUCER_MAX_MIN_C1
(
max
,
dt_float32
,
float
,
float
,
f32
,
float32x4_t
,
4
,
std
::
numeric_limits
<
dt_float32
>::
lowest
());
REDUCER_MAX_MIN_C1
(
min
,
dt_float32
,
float
,
float
,
f32
,
float32x4_t
,
4
,
std
::
numeric_limits
<
dt_float32
>::
max
());
#undef REDUCER_MAX_MIN_C1
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#define REDUCER_MAX_MIN_C1( \
_mode, _dtype, _ctype, _comp_type, _stype, __stype, _num, _init) \
...
...
@@ -440,38 +282,6 @@ REDUCER_MAX_MIN_C1(
#undef REDUCER_MAX_MIN_C1
#endif
#define REDUCER_MAX_MIN_C( \
_mode, _dtype, _ctype, _comp_type, _stype, __stype, _num, _init) \
template <> \
struct _mode##Reducer<_dtype, _ctype, _comp_type, false> { \
using ctype = _ctype; \
static constexpr int SIMD_WIDTH = _num; \
__stype res; \
ctype remain; \
_mode##Reducer(DType, size_t) { \
res = vdupq_n_##_stype(_init); \
remain = _init; \
} \
_mode##Reducer() = default; \
void feed(const ctype* val) { \
__stype vval = vld1q_##_stype(val); \
res = v##_mode##q_##_stype(vval, res); \
} \
void feed_remain(const ctype* val) { \
using namespace std; \
remain = _mode(*val, remain); \
} \
void post(ctype* dst) { vst1q_##_stype(dst, res); } \
void post_remain(ctype* dst) { *dst = remain; } \
}
REDUCER_MAX_MIN_C
(
max
,
dt_float32
,
float
,
float
,
f32
,
float32x4_t
,
4
,
std
::
numeric_limits
<
dt_float32
>::
lowest
());
REDUCER_MAX_MIN_C
(
min
,
dt_float32
,
float
,
float
,
f32
,
float32x4_t
,
4
,
std
::
numeric_limits
<
dt_float32
>::
max
());
#undef REDUCER_MAX_MIN_C
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#define REDUCER_MAX_MIN_C( \
_mode, _dtype, _ctype, _comp_type, _stype, __stype, _num, _init) \
...
...
@@ -513,45 +323,6 @@ struct SumReducer;
template
<
typename
dtype
,
typename
ctype
,
typename
comp_type
,
bool
C1
>
struct
ProductReducer
;
#define REDUCER_SUM_PRODUCT_C1( \
_mode, _dtype, _ctype, _comp_type, _stype, __stype, _num, _init, _act, _op) \
template <> \
struct _mode##Reducer<_dtype, _ctype, _comp_type, true> { \
using ctype = _ctype; \
static constexpr int SIMD_WIDTH = _num; \
__stype res; \
ctype remain; \
_mode##Reducer(DType, size_t) { \
res = vdupq_n_##_stype(_init); \
remain = _init; \
} \
_mode##Reducer() = default; \
void feed(const ctype* val) { \
__stype vval = vld1q_##_stype(val); \
res = v##_act##q_##_stype(vval, res); \
} \
void feed_remain(const ctype* val) { \
using namespace std; \
auto op = _op<ctype>(); \
remain = op(remain, *val); \
} \
void post(ctype* dst) { \
using namespace std; \
auto val = v##_act##_##_stype( \
vget_low_##_stype(res), vget_high_##_stype(res)); \
auto op = _op<ctype>(); \
*dst = \
op(remain, \
op(vget_lane_##_stype(val, 0), vget_lane_##_stype(val, 1))); \
} \
}
REDUCER_SUM_PRODUCT_C1
(
Sum
,
dt_float32
,
float
,
float
,
f32
,
float32x4_t
,
4
,
0
,
add
,
plus
);
REDUCER_SUM_PRODUCT_C1
(
Product
,
dt_float32
,
float
,
float
,
f32
,
float32x4_t
,
4
,
1.0
f
,
mul
,
multiplies
);
#undef REDUCER_SUM_PRODUCT_C1
#define REDUCER_SUM_PRODUCT_C( \
_mode, _dtype, _ctype, _comp_type, _stype, __stype, _num, _init, _act, _op) \
template <> \
...
...
@@ -578,9 +349,6 @@ REDUCER_SUM_PRODUCT_C1(
void post_remain(ctype* dst) { *dst = remain; } \
}
REDUCER_SUM_PRODUCT_C
(
Sum
,
dt_float32
,
float
,
float
,
f32
,
float32x4_t
,
4
,
0
,
add
,
plus
);
REDUCER_SUM_PRODUCT_C
(
Product
,
dt_float32
,
float
,
float
,
f32
,
float32x4_t
,
4
,
1
,
mul
,
multiplies
);
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
REDUCER_SUM_PRODUCT_C
(
Sum
,
__fp16
,
__fp16
,
__fp16
,
f16
,
float16x8_t
,
8
,
0
,
add
,
plus
);
REDUCER_SUM_PRODUCT_C
(
...
...
@@ -633,59 +401,6 @@ REDUCER_SUM_PRODUCT_C1(
template
<
typename
dtype
,
typename
ctype
,
typename
comp_type
,
bool
C1
>
struct
SumSqrReducer
;
template
<
>
struct
SumSqrReducer
<
dt_float32
,
float
,
float
,
true
>
{
using
ctype
=
float
;
static
constexpr
int
SIMD_WIDTH
=
4
;
float32x4_t
res
;
float
result
;
SumSqrReducer
(
DType
,
size_t
cnt
)
:
result
(
0.0
f
)
{
MEGDNN_MARK_USED_VAR
(
cnt
);
res
=
vdupq_n_f32
(
0.0
f
);
}
SumSqrReducer
()
=
default
;
void
feed
(
const
float
*
val
)
{
float32x4_t
vval
=
vld1q_f32
(
val
);
res
=
vaddq_f32
(
vmulq_f32
(
vval
,
vval
),
res
);
}
void
feed_remain
(
const
float
*
val
)
{
float
vval
=
*
val
;
result
+=
vval
*
vval
;
}
void
post
(
float
*
dst
)
{
#if MEGDNN_AARCH64
result
+=
vaddvq_f32
(
res
);
#elif MEGDNN_ARMV7
auto
sum_temp
=
vpadd_f32
(
vget_low_f32
(
res
),
vget_high_f32
(
res
));
result
+=
(
vget_lane_f32
(
sum_temp
,
0
)
+
vget_lane_f32
(
sum_temp
,
1
));
#else
#error "unsupport android arch"
#endif
*
dst
=
result
;
}
};
template
<
>
struct
SumSqrReducer
<
dt_float32
,
float
,
float
,
false
>
{
using
ctype
=
float
;
static
constexpr
int
SIMD_WIDTH
=
4
;
float32x4_t
res
;
float
remain
;
SumSqrReducer
(
DType
,
size_t
cnt
)
:
remain
(
0.0
f
)
{
MEGDNN_MARK_USED_VAR
(
cnt
);
res
=
vdupq_n_f32
(
0.0
f
);
}
SumSqrReducer
()
=
default
;
void
feed
(
const
float
*
val
)
{
float32x4_t
vval
=
vld1q_f32
(
val
);
res
=
vaddq_f32
(
vmulq_f32
(
vval
,
vval
),
res
);
}
void
feed_remain
(
const
float
*
val
)
{
remain
+=
(
*
val
)
*
(
*
val
);
}
void
post
(
float
*
dst
)
{
vst1q_f32
(
dst
,
res
);
}
void
post_remain
(
float
*
dst
)
{
*
dst
=
remain
;
}
};
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
template
<
>
struct
SumSqrReducer
<
__fp16
,
__fp16
,
__fp16
,
true
>
{
...
...
@@ -873,14 +588,12 @@ void ReduceImpl::exec(
default: \
break; \
}
if
(
src
.
layout
.
is_contiguous
()
&&
src
.
layout
.
dtype
.
category
()
==
DTypeCategory
::
QUANTIZED
&&
param
().
data_type
==
param
::
Reduce
::
DataType
::
DEFAULT
)
{
DType
src_type
=
src
.
layout
.
dtype
;
if
(
src
.
layout
.
dtype
.
enumv
()
==
DTypeEnum
::
QuantizedS8
)
{
DISPATCH_MODE_QUANTIZED
(
dt_qint8
,
int8_t
,
int32_t
)
}
if
(
src
.
layout
.
dtype
.
enumv
()
==
DTypeEnum
::
Quantized8Asymm
)
{
DISPATCH_MODE_QUANTIZED
(
dt_quint8
,
uint8_t
,
int32_t
)
}
...
...
@@ -889,9 +602,7 @@ void ReduceImpl::exec(
src
.
layout
.
dtype
.
category
()
==
DTypeCategory
::
FLOAT
&&
param
().
data_type
==
param
::
Reduce
::
DataType
::
DEFAULT
)
{
DType
src_type
=
src
.
layout
.
dtype
;
if
(
src
.
layout
.
dtype
.
enumv
()
==
DTypeEnum
::
Float32
)
{
DISPATCH_MODE_FLOAT
(
dt_float32
,
float
,
float
)
}
MEGDNN_MARK_USED_VAR
(
src_type
);
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
if
(
src
.
layout
.
dtype
.
enumv
()
==
DTypeEnum
::
Float16
)
{
DNN_INC_FLOAT16
(
DISPATCH_MODE_FLOAT
(
__fp16
,
__fp16
,
__fp16
));
...
...
dnn/src/fallback/general_intrinsic/gi_common.h
浏览文件 @
93c7e451
...
...
@@ -20,13 +20,19 @@
#else
#if defined(__arm__) || defined(__aarch64__)
#include <arm_neon.h>
#define GI_TARGET_ARM
#endif
#if defined(__x86_64__) || defined(__i386__)
#include <cpuid.h>
#include <immintrin.h>
#endif
#endif
#if defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
#define GI_TARGET_X86
#endif
#if defined(__arm__) || defined(__aarch64__)
#define GI_TARGET_ARM
#endif
#ifdef _WIN32
...
...
dnn/src/fallback/general_intrinsic/gi_float.h
浏览文件 @
93c7e451
...
...
@@ -454,22 +454,22 @@ GiBlendFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2, GI_FLOAT32 Selection) {
GiAndFloat32
(
Vector2
,
Selection
),
GiAndNotFloat32
(
Selection
,
Vector1
));
}
#define MIN_NAN(a, b) (isnan(a) || (a) < (b)) ? (a) : (b);
#define MAX_NAN(a, b) (isnan(a) || (a) > (b)) ? (a) : (b);
GI_FORCEINLINE
GI_FLOAT32
GiMaximumFloat32
(
GI_FLOAT32
Vector1
,
GI_FLOAT32
Vector2
)
{
#if defined(GI_NEON_INTRINSICS)
return
vmaxq_f32
(
Vector1
,
Vector2
);
#el
if defined(GI_SSE2_INTRINSICS)
#el
se
//! _mm_max_ps does not fellow the IEEE standard when input is NAN, so
//! implement by C code
#define MAX_NAN(a, b) (std::isnan(a) || (a) > (b)) ? (a) : (b);
GI_FLOAT32
max
;
for
(
size_t
i
=
0
;
i
<
GI_SIMD_LEN_BYTE
/
sizeof
(
float
);
i
++
)
{
max
[
i
]
=
MAX_NAN
(
Vector1
[
i
],
Vector2
[
i
]);
}
return
max
;
#else
return
GiBlendFloat32
(
Vector2
,
Vector1
,
Vector1
>
Vector2
);
#endif
}
...
...
@@ -478,18 +478,14 @@ GI_FLOAT32
GiMinimumFloat32
(
GI_FLOAT32
Vector1
,
GI_FLOAT32
Vector2
)
{
#if defined(GI_NEON_INTRINSICS)
return
vminq_f32
(
Vector1
,
Vector2
);
#elif defined(GI_SSE2_INTRINSICS)
return
_mm_min_ps
(
Vector1
,
Vector2
);
#else
//! _mm_min_ps does not fellow the IEEE standard when input is NAN, so
//! implement by C code
#define MIN_NAN(a, b) (std::isnan(a) || (a) < (b)) ? (a) : (b);
GI_FLOAT32
min
;
for
(
size_t
i
=
0
;
i
<
GI_SIMD_LEN_BYTE
/
sizeof
(
float
);
i
++
)
{
min
[
i
]
=
MIN_NAN
(
Vector1
[
i
],
Vector2
[
i
]);
}
return
min
;
#else
return
GiBlendFloat32
(
Vector2
,
Vector1
,
Vector2
>
Vector1
);
#endif
}
...
...
@@ -563,11 +559,6 @@ float GiReduceMaximumFloat32(GI_FLOAT32 Vector) {
VectorLow
=
vpmax_f32
(
VectorLow
,
VectorHigh
);
VectorLow
=
vpmax_f32
(
VectorLow
,
VectorHigh
);
return
vget_lane_f32
(
VectorLow
,
0
);
#elif defined(GI_VSX_INTRINSICS)
Vector
=
GiMaximumFloat32
(
Vector
,
GI_FLOAT32
(
vec_splat
((
__vector
long
long
)
Vector
,
1
)));
Vector
=
GiMaximumFloat32
(
Vector
,
vec_splat
(
Vector
,
1
));
return
Vector
[
0
];
#elif defined(GI_SSE2_INTRINSICS)
Vector
=
GiMaximumFloat32
(
Vector
,
_mm_shuffle_ps
(
Vector
,
Vector
,
_MM_SHUFFLE
(
2
,
3
,
2
,
3
)));
...
...
@@ -577,7 +568,7 @@ float GiReduceMaximumFloat32(GI_FLOAT32 Vector) {
#else
float
ret
=
Vector
[
0
];
for
(
size_t
i
=
1
;
i
<
GI_SIMD_LEN_BYTE
/
sizeof
(
float
);
i
++
)
{
ret
=
M
ax
(
ret
,
Vector
[
i
]);
ret
=
M
AX_NAN
(
ret
,
Vector
[
i
]);
}
return
ret
;
#endif
...
...
@@ -602,7 +593,7 @@ float GiReduceMinimumFloat32(GI_FLOAT32 Vector) {
#else
float
ret
=
Vector
[
0
];
for
(
size_t
i
=
1
;
i
<
GI_SIMD_LEN_BYTE
/
sizeof
(
float
);
i
++
)
{
ret
=
M
in
(
ret
,
Vector
[
i
]);
ret
=
M
IN_NAN
(
ret
,
Vector
[
i
]);
}
return
ret
;
#endif
...
...
dnn/src/fallback/general_intrinsic/gi_int.h
浏览文件 @
93c7e451
...
...
@@ -416,7 +416,7 @@ GiMoveLowLongInt16(GI_INT16 Vector) {
}
GI_FORCEINLINE
int
16
_t
GiReduceAddInt8
(
GI_INT8
Vector
)
{
int
32
_t
GiReduceAddInt8
(
GI_INT8
Vector
)
{
#if defined(GI_NEON64_INTRINSICS)
return
vaddlvq_s8
(
Vector
);
#elif defined(GI_NEON32_INTRINSICS)
...
...
@@ -467,8 +467,10 @@ int8_t GiReduceMaxInt8(GI_INT8 Vector) {
#elif defined(GI_NEON32_INTRINSICS)
int8x8_t
VectorLow
=
vget_low_s8
(
Vector
);
int8x8_t
VectorHigh
=
vget_high_s8
(
Vector
);
VectorLow
=
vpmin_s8
(
VectorLow
,
VectorHigh
);
VectorLow
=
vpmin_s8
(
VectorLow
,
VectorHigh
);
VectorLow
=
vpmax_s8
(
VectorLow
,
VectorHigh
);
VectorLow
=
vpmax_s8
(
VectorLow
,
VectorLow
);
VectorLow
=
vpmax_s8
(
VectorLow
,
VectorLow
);
VectorLow
=
vpmax_s8
(
VectorLow
,
VectorLow
);
return
vget_lane_s8
(
VectorLow
,
0
);
#elif defined(GI_SSE42_INTRINSICS)
__m128i
v0
=
_mm_cvtepi8_epi16
(
Vector
);
...
...
@@ -514,7 +516,9 @@ int8_t GiReduceMinInt8(GI_INT8 Vector) {
int8x8_t
VectorLow
=
vget_low_s8
(
Vector
);
int8x8_t
VectorHigh
=
vget_high_s8
(
Vector
);
VectorLow
=
vpmin_s8
(
VectorLow
,
VectorHigh
);
VectorLow
=
vpmin_s8
(
VectorLow
,
VectorHigh
);
VectorLow
=
vpmin_s8
(
VectorLow
,
VectorLow
);
VectorLow
=
vpmin_s8
(
VectorLow
,
VectorLow
);
VectorLow
=
vpmin_s8
(
VectorLow
,
VectorLow
);
return
vget_lane_s8
(
VectorLow
,
0
);
#elif defined(GI_SSE42_INTRINSICS)
__m128i
v0
=
_mm_cvtepi8_epi16
(
Vector
);
...
...
dnn/src/fallback/reduce/reducer.h
浏览文件 @
93c7e451
...
...
@@ -145,7 +145,7 @@ struct minReducer;
_mode##Reducer() = default; \
void feed(const float* val) { \
auto vval = GiLoadFloat32(val); \
res = Gi##_Mode##imumFloat32(
vval, res
); \
res = Gi##_Mode##imumFloat32(
res, vval
); \
} \
void feed_remain(const float* val) { \
auto vval = GiBroadcastFloat32(*val); \
...
...
@@ -172,7 +172,7 @@ REDUCER_MAX_MIN_C1(min, Min, std::numeric_limits<dt_float32>::max());
_mode##Reducer() = default; \
void feed(const float* val) { \
GI_FLOAT32 vval = GiLoadFloat32(val); \
res = Gi##_Mode##imumFloat32(
vval, res
); \
res = Gi##_Mode##imumFloat32(
res, vval
); \
} \
void feed_remain(const float* val) { \
using namespace std; \
...
...
@@ -200,7 +200,7 @@ REDUCER_MAX_MIN_C(min, Min, std::numeric_limits<dt_float32>::max());
} \
void feed_remain(const int8_t* val) { \
GI_INT8 vval = GiBroadcastInt8(*val); \
res = Gi##_Mode##imumInt8(
vval, res
); \
res = Gi##_Mode##imumInt8(
res, vval
); \
} \
void post(int8_t* dst) { *dst = GiReduce##_Mode##Int8(res); } \
}
...
...
@@ -223,7 +223,7 @@ REDUCER_MAX_MIN_C1(min, Min, 127);
_mode##Reducer() = default; \
void feed(const int8_t* val) { \
GI_INT8 vval = GiLoadInt8(val); \
res = Gi##_Mode##imumInt8(
vval, res
); \
res = Gi##_Mode##imumInt8(
res, vval
); \
} \
void feed_remain(const int8_t* val) { \
using namespace std; \
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录