Commit f12b75c0, authored on June 22, 2022 by Megvii Engine Team
Parent: 35cf0422

perf(dnn/fallback): optimize some corner case in reduce

GitOrigin-RevId: 1185594301dfb58d07b8b92212a8c7881fb204de

Showing 13 changed files with 585 additions and 163 deletions (+585, -163):
dnn/src/arm_common/reduce/opr_impl.cpp                    +99  -35
dnn/src/fallback/conv_bias/gi/fp32/do_conv_stride2.cpp    +32  -32
dnn/src/fallback/general_intrinsic/gi_common.h            +16  -0
dnn/src/fallback/general_intrinsic/gi_float.h             +25  -25
dnn/src/fallback/general_intrinsic/gi_int.h               +146 -0
dnn/src/fallback/pooling/gi/pooling_helper.h              +1   -1
dnn/src/fallback/reduce/opr_impl.cpp                      +7   -0
dnn/src/fallback/reduce/reducer.h                         +114 -61
dnn/src/fallback/resize/gi/resize_cv.cpp                  +2   -1
dnn/test/arm_common/reduce.cpp                            +2   -2
dnn/test/fallback/gi.cpp                                  +57  -2
dnn/test/fallback/reduce.cpp                              +57  -4
dnn/test/x86/convolution.cpp                              +27  -0
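The change targets reduce layouts of the form (A, B, C) with C == 1 and a small reduced extent B in {2, 3, 4}, where the generic kernel wastes most of each SIMD vector. As a reference for the vectorized kernels in the diffs below, here is a minimal scalar sketch of the computation they implement (names are illustrative, not part of the MegEngine API):

#include <cstddef>

// Scalar reference for the C == 1 case: src has shape (A, B) in row-major
// order and dst has shape (A); every row of B elements is reduced to one
// value. The new ExecC1SmallB kernels vectorize exactly this loop for
// B in {2, 3, 4} by de-interleaving SIMD_WIDTH rows at a time.
template <typename T, typename Op>
void reduce_c1_small_b_ref(
        const T* src, T* dst, size_t A, size_t B, Op op, T init) {
    for (size_t a = 0; a < A; ++a) {
        T acc = init;
        for (size_t b = 0; b < B; ++b)
            acc = op(acc, src[a * B + b]);
        dst[a] = acc;
    }
}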

dnn/src/arm_common/reduce/opr_impl.cpp
@@ -97,6 +97,7 @@ struct MeanReducer<__fp16, __fp16, __fp16, false> {
     }
     MeanReducer() = default;
     void feed(const ctype* val) { res = vaddq_f16(vld1q_f16(val), res); }
+    void feed_vector(const float16x8_t& vval) { res = vaddq_f16(vval, res); }
     void feed_remain(const ctype* val) { remain += *val; }
     void post(ctype* dst) {
         res = vmulq_n_f16(res, coef);
@@ -127,8 +128,8 @@ struct MeanReducer<dt_quint8, uint8_t, int32_t, false> {
         vcoef = vdupq_n_f32(coef);
     }
     MeanReducer() = default;
-    void feed(const uint8_t* val) {
-        const uint8x16_t vval = vld1q_u8(val);
+    void feed(const uint8_t* val) { feed_vector(vld1q_u8(val)); }
+    void feed_vector(const uint8x16_t& vval) {
         const uint16x8_t vval_low = vmovl_u8(vget_low_u8(vval));
         const uint16x8_t vval_high = vmovl_u8(vget_high_u8(vval));
@@ -219,8 +220,8 @@ REDUCER_MAX_MIN_C1(min, dt_quint8, uint8_t, uint8_t, u, uint, 255);
         remain = vdupq_n_##_stype(_init);                                    \
     }                                                                        \
     _mode##Reducer() = default;                                              \
-    void feed(const ctype* val) {                                            \
-        __stype##8x16_t vval = vld1q_##_stype(val);                          \
+    void feed(const ctype* val) { feed_vector(vld1q_##_stype(val)); }        \
+    void inline feed_vector(const __stype##8x16_t& vval) {                   \
         res = v##_mode##q_##_stype(vval, res);                               \
     }                                                                        \
     void feed_remain(const ctype* val) {                                     \
@@ -286,8 +287,8 @@ REDUCER_MAX_MIN_C1(
         remain = _init;                                                      \
     }                                                                        \
     _mode##Reducer() = default;                                              \
-    void feed(const ctype* val) {                                            \
-        __stype vval = vld1q_##_stype(val);                                  \
+    void feed(const ctype* val) { feed_vector(vld1q_##_stype(val)); }        \
+    void inline feed_vector(const __stype& vval) {                           \
         res = v##_mode##q_##_stype(vval, res);                               \
     }                                                                        \
     void feed_remain(const ctype* val) {                                     \
@@ -326,8 +327,8 @@ struct ProductReducer;
         remain = _init;                                                      \
     }                                                                        \
     _mode##Reducer() = default;                                              \
-    void feed(const ctype* val) {                                            \
-        __stype vval = vld1q_##_stype(val);                                  \
+    void feed(const ctype* val) { feed_vector(vld1q_##_stype(val)); }        \
+    void feed_vector(const __stype& vval) {                                  \
         res = v##_act##q_##_stype(vval, res);                                \
     }                                                                        \
     void feed_remain(const ctype* val) {                                     \
@@ -435,8 +436,8 @@ struct SumSqrReducer<__fp16, __fp16, __fp16, false> {
     fp16_fix_t remain;
     SumSqrReducer(DType, size_t cnt) : remain(0.0f) { res = vdupq_n_f16(0.0f); }
     SumSqrReducer() = default;
-    void feed(const __fp16* val) {
-        float16x8_t vval = vld1q_f16(val);
+    void feed(const __fp16* val) { feed_vector(vld1q_f16(val)); }
+    void inline feed_vector(const float16x8_t& vval) {
         res = vaddq_f16(vmulq_f16(vval, vval), res);
     }
     void feed_remain(const __fp16* val) { remain += (*val) * (*val); }
@@ -504,6 +505,60 @@ struct Exec<Reducer, false> {
     }
 };
 
+template <typename Reducer, typename dtype, size_t B>
+struct ExecC1SmallB {
+    static void do_reduce(
+            const dtype* src, dtype* dst, DType src_dtype, size_t A, size_t,
+            size_t C);
+};
+
+#define ImplementC1SmallB(_ctype, _simd_prefix, _simd_suffix)                \
+    template <typename Reducer, size_t B>                                    \
+    struct ExecC1SmallB<Reducer, _ctype, B> {                                \
+        static void do_reduce(                                               \
+                const _ctype* src, _ctype* dst, DType src_dtype, size_t A,   \
+                size_t, size_t) {                                            \
+            size_t a = 0;                                                    \
+            for (; a + Reducer::SIMD_WIDTH < A; a += Reducer::SIMD_WIDTH) {  \
+                Reducer reducer(src_dtype, B);                               \
+                auto src_ptr = src + a * B;                                  \
+                if (B == 4) {                                                \
+                    _simd_prefix##x4_t data_v4 = vld4q_##_simd_suffix(src_ptr); \
+                    reducer.feed_vector(data_v4.val[0]);                     \
+                    reducer.feed_vector(data_v4.val[1]);                     \
+                    reducer.feed_vector(data_v4.val[2]);                     \
+                    reducer.feed_vector(data_v4.val[3]);                     \
+                }                                                            \
+                if (B == 3) {                                                \
+                    _simd_prefix##x3_t data_v3 = vld3q_##_simd_suffix(src_ptr); \
+                    reducer.feed_vector(data_v3.val[0]);                     \
+                    reducer.feed_vector(data_v3.val[1]);                     \
+                    reducer.feed_vector(data_v3.val[2]);                     \
+                }                                                            \
+                if (B == 2) {                                                \
+                    _simd_prefix##x2_t data_v2 = vld2q_##_simd_suffix(src_ptr); \
+                    reducer.feed_vector(data_v2.val[0]);                     \
+                    reducer.feed_vector(data_v2.val[1]);                     \
+                }                                                            \
+                reducer.post(dst);                                           \
+                dst += Reducer::SIMD_WIDTH;                                  \
+            }                                                                \
+            for (; a < A; a++) {                                             \
+                Reducer reducer(src_dtype, B);                               \
+                auto src_ptr = src + a * B;                                  \
+                for (size_t i = 0; i < B; i++)                               \
+                    reducer.feed_remain(src_ptr + i);                        \
+                reducer.post_remain(dst);                                    \
+                dst++;                                                       \
+            }                                                                \
+        }                                                                    \
+    }
+ImplementC1SmallB(uint8_t, uint8x16, u8);
+#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ImplementC1SmallB(__fp16, float16x8, f16);
+#endif
+
 }  // anonymous namespace
 
 void ReduceImpl::exec(
@@ -513,31 +568,40 @@ void ReduceImpl::exec(
     reduce::get_ABC(src.layout, A, B, C, param().axis);
     bool execed = false;
     using Mode = param::Reduce::Mode;
-#define DISPATCH_FUNC(Reducer, dtype, ctype, comp_type)                      \
-    if (C == 1) {                                                            \
-        using _Reducer = Reducer<dtype, ctype, comp_type, true>;             \
-        std::function<void(const ctype*, ctype*, DType, size_t, size_t, size_t)> \
-                do_reduce = Exec<_Reducer, true>::do_reduce;                 \
-        MIDOUT_BEGIN(                                                        \
-                megdnn_arm_common_reduce, ctype, dtype, comp_type, midout_iv(1)) { \
-            MEGDNN_DISPATCH_CPU_KERN_OPR(do_reduce(                          \
-                    reinterpret_cast<ctype*>(src.raw_ptr()),                 \
-                    reinterpret_cast<ctype*>(dst.raw_ptr()), src_type, A, B, C)); \
-            execed = true;                                                   \
-        }                                                                    \
-        MIDOUT_END();                                                        \
-    } else {                                                                 \
-        using _Reducer = Reducer<dtype, ctype, comp_type, false>;            \
-        std::function<void(const ctype*, ctype*, DType, size_t, size_t, size_t)> \
-                do_reduce = Exec<_Reducer, false>::do_reduce;                \
-        MIDOUT_BEGIN(                                                        \
-                megdnn_arm_common_reduce, ctype, dtype, comp_type, midout_iv(1)) { \
-            MEGDNN_DISPATCH_CPU_KERN_OPR(do_reduce(                          \
-                    reinterpret_cast<ctype*>(src.raw_ptr()),                 \
-                    reinterpret_cast<ctype*>(dst.raw_ptr()), src_type, A, B, C)); \
-            execed = true;                                                   \
-        }                                                                    \
-        MIDOUT_END();                                                        \
+#define DISPATCH_FUNC(Reducer, _dtype, ctype, comp_type)                     \
+    if (C == 1) {                                                            \
+        using _Reducer = Reducer<_dtype, ctype, comp_type, true>;            \
+        using _ReducerC1SmallB = Reducer<_dtype, ctype, comp_type, false>;   \
+        std::function<void(const ctype*, ctype*, DType, size_t, size_t, size_t)> \
+                do_reduce = Exec<_Reducer, true>::do_reduce;                 \
+        if (src.layout.dtype.category() != DTypeCategory::FLOAT) {           \
+            if (B == 2)                                                      \
+                do_reduce = ExecC1SmallB<_ReducerC1SmallB, ctype, 2>::do_reduce; \
+            if (B == 3)                                                      \
+                do_reduce = ExecC1SmallB<_ReducerC1SmallB, ctype, 3>::do_reduce; \
+            if (B == 4)                                                      \
+                do_reduce = ExecC1SmallB<_ReducerC1SmallB, ctype, 4>::do_reduce; \
+        }                                                                    \
+        MIDOUT_BEGIN(                                                        \
+                megdnn_arm_common_reduce, ctype, _dtype, comp_type, midout_iv(1)) { \
+            MEGDNN_DISPATCH_CPU_KERN_OPR(do_reduce(                          \
+                    reinterpret_cast<ctype*>(src.raw_ptr()),                 \
+                    reinterpret_cast<ctype*>(dst.raw_ptr()), src_type, A, B, C)); \
+            execed = true;                                                   \
+        }                                                                    \
+        MIDOUT_END();                                                        \
+    } else {                                                                 \
+        using _Reducer = Reducer<_dtype, ctype, comp_type, false>;           \
+        std::function<void(const ctype*, ctype*, DType, size_t, size_t, size_t)> \
+                do_reduce = Exec<_Reducer, false>::do_reduce;                \
+        MIDOUT_BEGIN(                                                        \
+                megdnn_arm_common_reduce, ctype, _dtype, comp_type, midout_iv(1)) { \
+            MEGDNN_DISPATCH_CPU_KERN_OPR(do_reduce(                          \
+                    reinterpret_cast<ctype*>(src.raw_ptr()),                 \
+                    reinterpret_cast<ctype*>(dst.raw_ptr()), src_type, A, B, C)); \
+            execed = true;                                                   \
+        }                                                                    \
+        MIDOUT_END();                                                        \
     }
 #define DISPATCH_MODE_QUANTIZED(dtype, ctype, comp_type)                     \
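How the small-B path gains speed: an interleaved load such as vld2q reads SIMD_WIDTH consecutive rows of B = 2 elements and splits them so that lane i of subvector b holds element b of row a + i. Feeding each subvector to feed_vector therefore accumulates all rows in parallel, and post writes SIMD_WIDTH results at once. A portable sketch of that de-interleaving, with hypothetical helper names and a fixed width of 4 for illustration:

#include <array>
#include <cstddef>

constexpr size_t WIDTH = 4;  // stands in for Reducer::SIMD_WIDTH

// De-interleave WIDTH rows of B = 2 values the way vld2q does:
// col0[i] is element 0 of row i, col1[i] is element 1 of row i.
void load_uzip2(const float* src, std::array<float, WIDTH>& col0,
                std::array<float, WIDTH>& col1) {
    for (size_t i = 0; i < WIDTH; ++i) {
        col0[i] = src[2 * i];
        col1[i] = src[2 * i + 1];
    }
}

// Reducing across the two columns yields WIDTH row-sums per iteration,
// which is what ExecC1SmallB's feed_vector / post sequence computes.
void reduce_rows_b2(const float* src, float* dst) {
    std::array<float, WIDTH> c0, c1;
    load_uzip2(src, c0, c1);
    for (size_t i = 0; i < WIDTH; ++i)
        dst[i] = c0[i] + c1[i];
}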

dnn/src/fallback/conv_bias/gi/fp32/do_conv_stride2.cpp
@@ -36,7 +36,7 @@ void conv_stride2::do_conv_2x2_stride2(
         rep(i, nn) {
             GI_FLOAT32_t _outp = GiLoadFloat32(outptr);
-            GI_FLOAT32_V2_t _r0 = GiLd2qFloat32(r0);
+            GI_FLOAT32_V2_t _r0 = GiLoadUzipFloat32V2(r0);
             GI_FLOAT32_t _r00 = GiGetSubVectorFloat32V2(_r0, 0);  // 0 2 4 6
             GI_FLOAT32_t _r01 = GiGetSubVectorFloat32V2(_r0, 1);  // 1 3 5 7
@@ -44,7 +44,7 @@ void conv_stride2::do_conv_2x2_stride2(
             _outp = GiSimdFmaLane(_outp, _r00, _k0123, 0);
             _outp = GiSimdFmaLane(_outp, _r01, _k0123, 1);
-            GI_FLOAT32_V2_t _r1 = GiLd2qFloat32(r1);
+            GI_FLOAT32_V2_t _r1 = GiLoadUzipFloat32V2(r1);
             GI_FLOAT32_t _r10 = GiGetSubVectorFloat32V2(_r1, 0);
             GI_FLOAT32_t _r11 = GiGetSubVectorFloat32V2(_r1, 1);
@@ -94,8 +94,8 @@ void conv_stride2::do_conv_3x3_stride2(
         rep(i, nn) {
             GI_FLOAT32_t _outp = GiLoadFloat32(outptr);
-            GI_FLOAT32_V2_t _r0 = GiLd2qFloat32(r0);
-            GI_FLOAT32_V2_t _r0n = GiLd2qFloat32(r0 + 8);
+            GI_FLOAT32_V2_t _r0 = GiLoadUzipFloat32V2(r0);
+            GI_FLOAT32_V2_t _r0n = GiLoadUzipFloat32V2(r0 + 8);
             GI_FLOAT32_t _r00 = GiGetSubVectorFloat32V2(_r0, 0);  // 0 2 4 6
             GI_FLOAT32_t _r01 = GiGetSubVectorFloat32V2(_r0, 1);  // 1 3 5 7
@@ -106,8 +106,8 @@ void conv_stride2::do_conv_3x3_stride2(
             _outp = GiSimdFmaLane(_outp, _r01, _k0123, 1);
             _outp = GiSimdFmaLane(_outp, _r02, _k0123, 2);
-            GI_FLOAT32_V2_t _r1 = GiLd2qFloat32(r1);
-            GI_FLOAT32_V2_t _r1n = GiLd2qFloat32(r1 + 8);
+            GI_FLOAT32_V2_t _r1 = GiLoadUzipFloat32V2(r1);
+            GI_FLOAT32_V2_t _r1n = GiLoadUzipFloat32V2(r1 + 8);
             GI_FLOAT32_t _r10 = GiGetSubVectorFloat32V2(_r1, 0);
             GI_FLOAT32_t _r11 = GiGetSubVectorFloat32V2(_r1, 1);
@@ -118,8 +118,8 @@ void conv_stride2::do_conv_3x3_stride2(
             _outp = GiSimdFmaLane(_outp, _r11, _k3456, 1);
             _outp = GiSimdFmaLane(_outp, _r12, _k3456, 2);
-            GI_FLOAT32_V2_t _r2 = GiLd2qFloat32(r2);
-            GI_FLOAT32_V2_t _r2n = GiLd2qFloat32(r2 + 8);
+            GI_FLOAT32_V2_t _r2 = GiLoadUzipFloat32V2(r2);
+            GI_FLOAT32_V2_t _r2n = GiLoadUzipFloat32V2(r2 + 8);
             GI_FLOAT32_t _r20 = GiGetSubVectorFloat32V2(_r2, 0);
             GI_FLOAT32_t _r21 = GiGetSubVectorFloat32V2(_r2, 1);
@@ -176,8 +176,8 @@ void conv_stride2::do_conv_5x5_stride2(
         rep(i, nn) {
             GI_FLOAT32_t _sum = GiLoadFloat32(outptr);
-            GI_FLOAT32_V2_t _r00_02461357 = GiLd2qFloat32(r0);
-            GI_FLOAT32_V2_t _r00nx2 = GiLd2qFloat32(r0 + 8);
+            GI_FLOAT32_V2_t _r00_02461357 = GiLoadUzipFloat32V2(r0);
+            GI_FLOAT32_V2_t _r00nx2 = GiLoadUzipFloat32V2(r0 + 8);
             GI_FLOAT32_t _r0_8101214 = GiGetSubVectorFloat32V2(_r00nx2, 0);  // 8 10 12 14
             GI_FLOAT32_t _r0_9111315 =
@@ -190,8 +190,8 @@ void conv_stride2::do_conv_5x5_stride2(
             GI_FLOAT32_t _r03 = GiExtqFloat32(_r01, _r0_9111315, 1);  // 3 5 7 9
             GI_FLOAT32_t _r04 = GiExtqFloat32(_r00, _r0_8101214, 2);  // 4 6 8 10
-            GI_FLOAT32_V2_t _r10_02461357 = GiLd2qFloat32(r1);
-            GI_FLOAT32_V2_t _r10nx2 = GiLd2qFloat32(r1 + 8);
+            GI_FLOAT32_V2_t _r10_02461357 = GiLoadUzipFloat32V2(r1);
+            GI_FLOAT32_V2_t _r10nx2 = GiLoadUzipFloat32V2(r1 + 8);
             GI_FLOAT32_t _r1_8101214 = GiGetSubVectorFloat32V2(_r10nx2, 0);
             GI_FLOAT32_t _r1_9111315 = GiGetSubVectorFloat32V2(_r10nx2, 1);
             GI_FLOAT32_t _r10 = GiGetSubVectorFloat32V2(_r10_02461357, 0);
@@ -200,8 +200,8 @@ void conv_stride2::do_conv_5x5_stride2(
             GI_FLOAT32_t _r13 = GiExtqFloat32(_r11, _r1_9111315, 1);
             GI_FLOAT32_t _r14 = GiExtqFloat32(_r10, _r1_8101214, 2);
-            GI_FLOAT32_V2_t _r20_02461357 = GiLd2qFloat32(r2);
-            GI_FLOAT32_V2_t _r20nx2 = GiLd2qFloat32(r2 + 8);
+            GI_FLOAT32_V2_t _r20_02461357 = GiLoadUzipFloat32V2(r2);
+            GI_FLOAT32_V2_t _r20nx2 = GiLoadUzipFloat32V2(r2 + 8);
             GI_FLOAT32_t _r2_8101214 = GiGetSubVectorFloat32V2(_r20nx2, 0);
             GI_FLOAT32_t _r2_9111315 = GiGetSubVectorFloat32V2(_r20nx2, 1);
             GI_FLOAT32_t _r20 = GiGetSubVectorFloat32V2(_r20_02461357, 0);
@@ -210,8 +210,8 @@ void conv_stride2::do_conv_5x5_stride2(
             GI_FLOAT32_t _r23 = GiExtqFloat32(_r21, _r2_9111315, 1);
             GI_FLOAT32_t _r24 = GiExtqFloat32(_r20, _r2_8101214, 2);
-            GI_FLOAT32_V2_t _r30_02461357 = GiLd2qFloat32(r3);
-            GI_FLOAT32_V2_t _r30nx2 = GiLd2qFloat32(r3 + 8);
+            GI_FLOAT32_V2_t _r30_02461357 = GiLoadUzipFloat32V2(r3);
+            GI_FLOAT32_V2_t _r30nx2 = GiLoadUzipFloat32V2(r3 + 8);
             GI_FLOAT32_t _r3_8101214 = GiGetSubVectorFloat32V2(_r30nx2, 0);
             GI_FLOAT32_t _r3_9111315 = GiGetSubVectorFloat32V2(_r30nx2, 1);
             GI_FLOAT32_t _r30 = GiGetSubVectorFloat32V2(_r30_02461357, 0);
@@ -220,8 +220,8 @@ void conv_stride2::do_conv_5x5_stride2(
             GI_FLOAT32_t _r33 = GiExtqFloat32(_r31, _r3_9111315, 1);
             GI_FLOAT32_t _r34 = GiExtqFloat32(_r30, _r3_8101214, 2);
-            GI_FLOAT32_V2_t _r40_02461357 = GiLd2qFloat32(r4);
-            GI_FLOAT32_V2_t _r40nx2 = GiLd2qFloat32(r4 + 8);
+            GI_FLOAT32_V2_t _r40_02461357 = GiLoadUzipFloat32V2(r4);
+            GI_FLOAT32_V2_t _r40nx2 = GiLoadUzipFloat32V2(r4 + 8);
             GI_FLOAT32_t _r4_8101214 = GiGetSubVectorFloat32V2(_r40nx2, 0);
             GI_FLOAT32_t _r4_9111315 = GiGetSubVectorFloat32V2(_r40nx2, 1);
             GI_FLOAT32_t _r40 = GiGetSubVectorFloat32V2(_r40_02461357, 0);
@@ -315,8 +315,8 @@ void conv_stride2::do_conv_7x7_stride2(
             GI_FLOAT32_t _k0123 = GiLoadFloat32(k0);
             GI_FLOAT32_t _k4567 = GiLoadFloat32(k0 + 4);
-            GI_FLOAT32_V2_t _r00_02461357 = GiLd2qFloat32(r0);
-            GI_FLOAT32_V2_t _r00nx2 = GiLd2qFloat32(r0 + 8);
+            GI_FLOAT32_V2_t _r00_02461357 = GiLoadUzipFloat32V2(r0);
+            GI_FLOAT32_V2_t _r00nx2 = GiLoadUzipFloat32V2(r0 + 8);
             GI_FLOAT32_t _r0_8101214 = GiGetSubVectorFloat32V2(_r00nx2, 0);  // 8 10 12 14
             GI_FLOAT32_t _r0_9111315 =
@@ -342,8 +342,8 @@ void conv_stride2::do_conv_7x7_stride2(
             GI_FLOAT32_t _k78910 = GiLoadFloat32(k1);
             GI_FLOAT32_t _k11121314 = GiLoadFloat32(k1 + 4);
-            GI_FLOAT32_V2_t _r10_02461357 = GiLd2qFloat32(r1);
-            GI_FLOAT32_V2_t _r10nx2 = GiLd2qFloat32(r1 + 8);
+            GI_FLOAT32_V2_t _r10_02461357 = GiLoadUzipFloat32V2(r1);
+            GI_FLOAT32_V2_t _r10nx2 = GiLoadUzipFloat32V2(r1 + 8);
             GI_FLOAT32_t _r1_8101214 = GiGetSubVectorFloat32V2(_r10nx2, 0);
             GI_FLOAT32_t _r1_9111315 = GiGetSubVectorFloat32V2(_r10nx2, 1);
             GI_FLOAT32_t _r10 = GiGetSubVectorFloat32V2(_r10_02461357, 0);
@@ -365,8 +365,8 @@ void conv_stride2::do_conv_7x7_stride2(
             GI_FLOAT32_t _k14151617 = GiLoadFloat32(k2);
             GI_FLOAT32_t _k18192021 = GiLoadFloat32(k2 + 4);
-            GI_FLOAT32_V2_t _r20_02461357 = GiLd2qFloat32(r2);
-            GI_FLOAT32_V2_t _r20nx2 = GiLd2qFloat32(r2 + 8);
+            GI_FLOAT32_V2_t _r20_02461357 = GiLoadUzipFloat32V2(r2);
+            GI_FLOAT32_V2_t _r20nx2 = GiLoadUzipFloat32V2(r2 + 8);
             GI_FLOAT32_t _r2_8101214 = GiGetSubVectorFloat32V2(_r20nx2, 0);
             GI_FLOAT32_t _r2_9111315 = GiGetSubVectorFloat32V2(_r20nx2, 1);
             GI_FLOAT32_t _r20 = GiGetSubVectorFloat32V2(_r20_02461357, 0);
@@ -388,8 +388,8 @@ void conv_stride2::do_conv_7x7_stride2(
             GI_FLOAT32_t _k21222324 = GiLoadFloat32(k3);
             GI_FLOAT32_t _k25262728 = GiLoadFloat32(k3 + 4);
-            GI_FLOAT32_V2_t _r30_02461357 = GiLd2qFloat32(r3);
-            GI_FLOAT32_V2_t _r30nx2 = GiLd2qFloat32(r3 + 8);
+            GI_FLOAT32_V2_t _r30_02461357 = GiLoadUzipFloat32V2(r3);
+            GI_FLOAT32_V2_t _r30nx2 = GiLoadUzipFloat32V2(r3 + 8);
             GI_FLOAT32_t _r3_8101214 = GiGetSubVectorFloat32V2(_r30nx2, 0);
             GI_FLOAT32_t _r3_9111315 = GiGetSubVectorFloat32V2(_r30nx2, 1);
             GI_FLOAT32_t _r30 = GiGetSubVectorFloat32V2(_r30_02461357, 0);
@@ -411,8 +411,8 @@ void conv_stride2::do_conv_7x7_stride2(
             GI_FLOAT32_t _k28293031 = GiLoadFloat32(k4);
             GI_FLOAT32_t _k32333435 = GiLoadFloat32(k4 + 4);
-            GI_FLOAT32_V2_t _r40_02461357 = GiLd2qFloat32(r4);
-            GI_FLOAT32_V2_t _r40nx2 = GiLd2qFloat32(r4 + 8);
+            GI_FLOAT32_V2_t _r40_02461357 = GiLoadUzipFloat32V2(r4);
+            GI_FLOAT32_V2_t _r40nx2 = GiLoadUzipFloat32V2(r4 + 8);
             GI_FLOAT32_t _r4_8101214 = GiGetSubVectorFloat32V2(_r40nx2, 0);
             GI_FLOAT32_t _r4_9111315 = GiGetSubVectorFloat32V2(_r40nx2, 1);
             GI_FLOAT32_t _r40 = GiGetSubVectorFloat32V2(_r40_02461357, 0);
@@ -434,8 +434,8 @@ void conv_stride2::do_conv_7x7_stride2(
             GI_FLOAT32_t _k35363738 = GiLoadFloat32(k5);
             GI_FLOAT32_t _k39404142 = GiLoadFloat32(k5 + 4);
-            GI_FLOAT32_V2_t _r50_02461357 = GiLd2qFloat32(r5);
-            GI_FLOAT32_V2_t _r50nx2 = GiLd2qFloat32(r5 + 8);
+            GI_FLOAT32_V2_t _r50_02461357 = GiLoadUzipFloat32V2(r5);
+            GI_FLOAT32_V2_t _r50nx2 = GiLoadUzipFloat32V2(r5 + 8);
             GI_FLOAT32_t _r5_8101214 = GiGetSubVectorFloat32V2(_r50nx2, 0);
             GI_FLOAT32_t _r5_9111315 = GiGetSubVectorFloat32V2(_r50nx2, 1);
             GI_FLOAT32_t _r50 = GiGetSubVectorFloat32V2(_r50_02461357, 0);
@@ -457,8 +457,8 @@ void conv_stride2::do_conv_7x7_stride2(
             GI_FLOAT32_t _k42434445 = GiLoadFloat32(k6);
             GI_FLOAT32_t _k45464748 = GiLoadFloat32(k6 + 3);
-            GI_FLOAT32_V2_t _r60_02461357 = GiLd2qFloat32(r6);
-            GI_FLOAT32_V2_t _r60nx2 = GiLd2qFloat32(r6 + 8);
+            GI_FLOAT32_V2_t _r60_02461357 = GiLoadUzipFloat32V2(r6);
+            GI_FLOAT32_V2_t _r60nx2 = GiLoadUzipFloat32V2(r6 + 8);
             GI_FLOAT32_t _r6_8101214 = GiGetSubVectorFloat32V2(_r60nx2, 0);
             GI_FLOAT32_t _r6_9111315 = GiGetSubVectorFloat32V2(_r60nx2, 1);
             GI_FLOAT32_t _r60 = GiGetSubVectorFloat32V2(_r60_02461357, 0);
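In these stride-2 convolution kernels the renamed load serves a different purpose: GiLoadUzipFloat32V2 splits 8 consecutive inputs into even lanes (0 2 4 6) and odd lanes (1 3 5 7), which is exactly the set of input columns a stride-2 filter tap touches. A scalar sketch of that access pattern, illustrative only:

#include <cstddef>

// For output x, a stride-2 filter tap k reads input 2*x + k. Splitting the
// input into even/odd streams turns each tap into a unit-stride access,
// which is what the _r00 (even) / _r01 (odd) vectors above hold.
void conv1d_k2_s2_ref(const float* in, const float* k, float* out, size_t ow) {
    for (size_t x = 0; x < ow; ++x)
        out[x] = in[2 * x] * k[0]       // even lane, like _r00
               + in[2 * x + 1] * k[1];  // odd lane, like _r01
}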

dnn/src/fallback/general_intrinsic/gi_common.h
@@ -151,6 +151,8 @@ typedef int32x4x2_t GI_INT32_V2_t;
 typedef int32x4x4_t GI_INT32_V4_t;
 typedef int16x8x2_t GI_INT16_V2_t;
 typedef int8x16x2_t GI_INT8_V2_t;
+typedef int8x16x3_t GI_INT8_V3_t;
+typedef int8x16x4_t GI_INT8_V4_t;
 typedef int64x2_t GI_INT64_t;
 #elif defined(GI_SSE2_INTRINSICS) || defined(GI_SSE42_INTRINSICS)
@@ -302,6 +304,8 @@ typedef vint32m1x2_t GI_INT32_V2_t;
 typedef vint32m1x4_t GI_INT32_V4_t;
 typedef vint16m1x2_t GI_INT16_V2_t;
 typedef vint8m1x2_t GI_INT8_V2_t;
+typedef vint8m1x3_t GI_INT8_V3_t;
+typedef vint8m1x4_t GI_INT8_V4_t;
 //! vfloat32mf2_t usable at RVV1.0, now we support 0.7, as
 //! a workaround, we use vfloat32m1_t instead
 typedef vfloat32m1_t float32x2_t;
@@ -390,6 +394,14 @@ typedef struct {
     GI_INT8_t val[2];
 } GI_INT8_V2_t;
+typedef struct {
+    GI_INT8_t val[3];
+} GI_INT8_V3_t;
+typedef struct {
+    GI_INT8_t val[4];
+} GI_INT8_V4_t;
 #endif
 //! variable length type intrinsic can not be a member of c++ class
 //! caused by can not do sizeof at build stage, for example RVV and SVE
@@ -417,6 +429,8 @@ typedef GI_UINT32_NAIVE_t GI_UINT32_FIXLEN_t;
 #define GiGetSubVectorInt16V2(s, index) vget_i16m1x2_i16m1(s, index)
 #define GiGetSubVectorInt8V2(s, index) vget_i8m1x2_i8m1(s, index)
+#define GiGetSubVectorInt8V3(s, index) vget_i8m1x3_i8m1(s, index)
+#define GiGetSubVectorInt8V4(s, index) vget_i8m1x4_i8m1(s, index)
 //! insert subvector
 #define GiSetSubVectorFloat32V2(d, index, s) d = vset_f32m1x2(d, index, s)
@@ -570,6 +584,8 @@ typedef GI_UINT32_t GI_UINT32_FIXLEN_t;
 #define GiGetSubVectorInt16V2(s, index) s.val[index]
 #define GiGetSubVectorInt8V2(s, index) s.val[index]
+#define GiGetSubVectorInt8V3(s, index) s.val[index]
+#define GiGetSubVectorInt8V4(s, index) s.val[index]
 //! insert subvector
 #define GiSetSubVectorFloat32V2(d, index, s) d.val[index] = s

dnn/src/fallback/general_intrinsic/gi_float.h
@@ -381,31 +381,6 @@ GI_FORCEINLINE void GiSt1Float32(float* ptr, float32x2_t val) {
 #endif
 }
 
-GI_FORCEINLINE
-GI_FLOAT32_V2_t GiLd2qFloat32(const float* Buffer) {
-#if defined(GI_NEON_INTRINSICS)
-    return vld2q_f32(Buffer);
-#elif defined(GI_SSE2_INTRINSICS)
-    GI_FLOAT32_V2_t v;
-    v.val[0] = GiLoadFloat32(Buffer);
-    v.val[1] = GiLoadFloat32((Buffer + 4));
-    v = GiUzpqFloat32(v.val[0], v.val[1]);
-    return v;
-#elif defined(GI_RVV_INTRINSICS)
-    return vlseg2e32_v_f32m1x2(Buffer, GI_SIMD_LEN_BYTE / sizeof(float));
-#else
-    GI_FLOAT32_V2_t ret;
-    ret.val[0][0] = Buffer[0];
-    ret.val[0][1] = Buffer[2];
-    ret.val[0][2] = Buffer[4];
-    ret.val[0][3] = Buffer[6];
-    ret.val[1][0] = Buffer[1];
-    ret.val[1][1] = Buffer[3];
-    ret.val[1][2] = Buffer[5];
-    ret.val[1][3] = Buffer[7];
-    return ret;
-#endif
-}
-
 #if defined(GI_NEON_INTRINSICS)
 #define GiExtqFloat32(a, b, n) vextq_f32(a, b, n)
 #elif defined(GI_SSE2_INTRINSICS)
@@ -1709,6 +1684,31 @@ GI_FORCEINLINE float32x2_t GiPmaxFloat32(float32x2_t a, float32x2_t b) {
 #endif
 }
 
+GI_FORCEINLINE
+GI_FLOAT32_V2_t GiLoadUzipFloat32V2(const float* Buffer) {
+#if defined(GI_NEON_INTRINSICS)
+    return vld2q_f32(Buffer);
+#elif defined(GI_SSE2_INTRINSICS)
+    GI_FLOAT32_V2_t v;
+    v.val[0] = GiLoadFloat32(Buffer);
+    v.val[1] = GiLoadFloat32((Buffer + 4));
+    v = GiUzpqFloat32(v.val[0], v.val[1]);
+    return v;
+#elif defined(GI_RVV_INTRINSICS)
+    return vlseg2e32_v_f32m1x2(Buffer, GI_SIMD_LEN_BYTE / sizeof(float));
+#else
+    GI_FLOAT32_V2_t ret;
+    ret.val[0][0] = Buffer[0];
+    ret.val[0][1] = Buffer[2];
+    ret.val[0][2] = Buffer[4];
+    ret.val[0][3] = Buffer[6];
+    ret.val[1][0] = Buffer[1];
+    ret.val[1][1] = Buffer[3];
+    ret.val[1][2] = Buffer[5];
+    ret.val[1][3] = Buffer[7];
+    return ret;
+#endif
+}
+
 GI_FORCEINLINE
 GI_FLOAT32_V3_t GiLoadUzipFloat32V3(const float* ptr) {
 #if defined(GI_NEON_INTRINSICS)
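A small usage sketch for the renamed intrinsic, mirroring the GiLoadUzipFloat32V2 test in dnn/test/fallback/gi.cpp and assuming a translation unit that already includes gi_float.h with 128-bit vectors (GI_SIMD_LEN_BYTE == 16):

#include <cassert>

// Quick check of the even/odd split performed by GiLoadUzipFloat32V2:
// subvector 0 receives the even-indexed floats, subvector 1 the odd ones.
void check_uzip_v2() {
    float buf[8] = {0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f};
    GI_FLOAT32_V2_t v = GiLoadUzipFloat32V2(buf);
    float even[4], odd[4];
    GiStoreFloat32(even, GiGetSubVectorFloat32V2(v, 0));
    GiStoreFloat32(odd, GiGetSubVectorFloat32V2(v, 1));
    for (int i = 0; i < 4; ++i) {
        assert(even[i] == buf[2 * i]);
        assert(odd[i] == buf[2 * i + 1]);
    }
}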

dnn/src/fallback/general_intrinsic/gi_int.h
@@ -86,6 +86,152 @@ GI_INT8_t GiLoadInt8(const void* Buffer) {
 #endif
 }
 
+GI_FORCEINLINE
+GI_INT8_V2_t GiLoadUzipInt8V2(const void* Buffer) {
+#if defined(GI_NEON_INTRINSICS)
+    return vld2q_s8((int8_t*)Buffer);
+#elif defined(GI_SSE42_INTRINSICS)
+    GI_INT8_t v0, v1;
+    v0 = _mm_loadu_si128((const __m128i*)Buffer);
+    v1 = _mm_loadu_si128((const __m128i*)((int8_t*)Buffer + 16));
+    GI_INT8_V2_t ret;
+    v0 = _mm_shuffle_epi8(
+            v0, _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15));
+    v1 = _mm_shuffle_epi8(
+            v1, _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15));
+    ret.val[0] = _mm_unpacklo_epi64(v0, v1);
+    ret.val[1] = _mm_unpackhi_epi64(v0, v1);
+    return ret;
+#elif defined(GI_RVV_INTRINSICS)
+    return vlseg2e8_v_i8m1x2((int8_t*)Buffer, GI_SIMD_LEN_BYTE / sizeof(int8_t));
+#else
+    int8_t data[2 * GI_SIMD_LEN_BYTE];
+    const int8_t* ptr = (int8_t*)Buffer;
+    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) {
+        data[i] = ptr[2 * i];
+        data[GI_SIMD_LEN_BYTE + i] = ptr[2 * i + 1];
+    }
+    GI_INT8_V2_t ret;
+    ret.val[0] = GiLoadInt8(data);
+    ret.val[1] = GiLoadInt8(data + GI_SIMD_LEN_BYTE);
+    return ret;
+#endif
+}
+
+GI_FORCEINLINE
+GI_INT8_V3_t GiLoadUzipInt8V3(const void* Buffer) {
+#if defined(GI_NEON_INTRINSICS)
+    return vld3q_s8((int8_t*)Buffer);
+#elif defined(GI_SSE42_INTRINSICS)
+    GI_INT8_V3_t v;
+    __m128i tmp0, tmp1, tmp2, tmp3;
+    static const int8_t mask8_0[16] = {0, 3, 6, 9,  12, 15, 1, 4,
+                                       7, 10, 13, 2, 5,  8,  11, 14};
+    static const int8_t mask8_1[16] = {2, 5, 8,  11, 14, 0, 3, 6,
+                                       9, 12, 15, 1,  4,  7, 10, 13};
+    static const int8_t mask8_2[16] = {1,  4,  7, 10, 13, 2, 5, 8,
+                                       11, 14, 0, 3,  6,  9, 12, 15};
+    v.val[0] = _mm_loadu_si128((const __m128i*)Buffer);
+    v.val[1] = _mm_loadu_si128((const __m128i*)((int8_t*)Buffer + 16));
+    v.val[2] = _mm_loadu_si128((const __m128i*)((int8_t*)Buffer + 32));
+    tmp0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask8_0);
+    tmp1 = _mm_shuffle_epi8(v.val[1], *(__m128i*)mask8_1);
+    tmp2 = _mm_shuffle_epi8(v.val[2], *(__m128i*)mask8_2);
+    tmp3 = _mm_slli_si128(tmp0, 10);
+    tmp3 = _mm_alignr_epi8(tmp1, tmp3, 10);  // a:0,3,6,9,12,15,b:2,5,8,11,14,x,x,x,x,x
+    tmp3 = _mm_slli_si128(tmp3, 5);          // 0,0,0,0,0,a:0,3,6,9,12,15,b:2,5,8,11,14,
+    tmp3 = _mm_srli_si128(tmp3, 5);          // a:0,3,6,9,12,15,b:2,5,8,11,14,:0,0,0,0,0
+    v.val[0] = _mm_slli_si128(tmp2, 11);     // 0,0,0,0,0,0,0,0,0,0,0,0, 1,4,7,10,13,
+    v.val[0] = _mm_or_si128(v.val[0], tmp3);
+    tmp3 = _mm_slli_si128(tmp0, 5);          // 0,0,0,0,0,a:0,3,6,9,12,15,1,4,7,10,13,
+    tmp3 = _mm_srli_si128(tmp3, 11);         // a:1,4,7,10,13, 0,0,0,0,0,0,0,0,0,0,0
+    v.val[1] = _mm_srli_si128(tmp1, 5);      // b:0,3,6,9,12,15,C:1,4,7,10,13, 0,0,0,0,0
+    v.val[1] = _mm_slli_si128(v.val[1], 5);  // 0,0,0,0,0,b:0,3,6,9,12,15,C:1,4,7,10,13,
+    v.val[1] = _mm_or_si128(v.val[1], tmp3);
+    v.val[1] = _mm_slli_si128(v.val[1], 5);  // 0,0,0,0,0,a:1,4,7,10,13,b:0,3,6,9,12,15,
+    v.val[1] = _mm_srli_si128(v.val[1], 5);  // a:1,4,7,10,13,b:0,3,6,9,12,15,0,0,0,0,0
+    tmp3 = _mm_srli_si128(tmp2, 5);          // c:2,5,8,11,14,0,3,6,9,12,15,0,0,0,0,0
+    tmp3 = _mm_slli_si128(tmp3, 11);         // 0,0,0,0,0,0,0,0,0,0,0,c:2,5,8,11,14,
+    v.val[1] = _mm_or_si128(v.val[1], tmp3);
+    tmp3 = _mm_srli_si128(tmp2, 10);         // c:0,3,6,9,12,15, 0,0,0,0,0,0,0,0,0,0,
+    tmp3 = _mm_slli_si128(tmp3, 10);         // 0,0,0,0,0,0,0,0,0,0, c:0,3,6,9,12,15,
+    v.val[2] = _mm_srli_si128(tmp1, 11);     // b:1,4,7,10,13,0,0,0,0,0,0,0,0,0,0,0
+    v.val[2] = _mm_slli_si128(v.val[2], 5);  // 0,0,0,0,0,b:1,4,7,10,13, 0,0,0,0,0,0
+    v.val[2] = _mm_or_si128(v.val[2], tmp3); // 0,0,0,0,0,b:1,4,7,10,13,c:0,3,6,9,12,15,
+    tmp0 = _mm_srli_si128(tmp0, 11);         // a:2,5,8,11,14, 0,0,0,0,0,0,0,0,0,0,0,
+    v.val[2] = _mm_or_si128(v.val[2], tmp0);
+    return v;
+#elif defined(GI_RVV_INTRINSICS)
+    return vlseg3e8_v_i8m1x3((int8_t*)Buffer, GI_SIMD_LEN_BYTE / sizeof(int8_t));
+#else
+    int8_t data[3 * GI_SIMD_LEN_BYTE];
+    const int8_t* ptr = (int8_t*)Buffer;
+    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) {
+        data[i] = ptr[3 * i];
+        data[GI_SIMD_LEN_BYTE + i] = ptr[3 * i + 1];
+        data[2 * GI_SIMD_LEN_BYTE + i] = ptr[3 * i + 2];
+    }
+    GI_INT8_V3_t ret;
+    ret.val[0] = GiLoadInt8(data);
+    ret.val[1] = GiLoadInt8(data + GI_SIMD_LEN_BYTE);
+    ret.val[2] = GiLoadInt8(data + 2 * GI_SIMD_LEN_BYTE);
+    return ret;
+#endif
+}
+
+GI_FORCEINLINE
+GI_INT8_V4_t GiLoadUzipInt8V4(const void* Buffer) {
+#if defined(GI_NEON_INTRINSICS)
+    return vld4q_s8((int8_t*)Buffer);
+#elif defined(GI_SSE2_INTRINSICS)
+    GI_INT8_V4_t v;
+    __m128i tmp3, tmp2, tmp1, tmp0;
+    v.val[0] = _mm_loadu_si128((const __m128i*)Buffer);
+    v.val[1] = _mm_loadu_si128((const __m128i*)((int8_t*)Buffer + 16));
+    v.val[2] = _mm_loadu_si128((const __m128i*)((int8_t*)Buffer + 32));
+    v.val[3] = _mm_loadu_si128((const __m128i*)((int8_t*)Buffer + 48));
+    tmp0 = _mm_unpacklo_epi8(v.val[0], v.val[1]);
+    tmp1 = _mm_unpacklo_epi8(v.val[2], v.val[3]);
+    tmp2 = _mm_unpackhi_epi8(v.val[0], v.val[1]);
+    tmp3 = _mm_unpackhi_epi8(v.val[2], v.val[3]);
+    v.val[0] = _mm_unpacklo_epi8(tmp0, tmp2);
+    v.val[1] = _mm_unpackhi_epi8(tmp0, tmp2);
+    v.val[2] = _mm_unpacklo_epi8(tmp1, tmp3);
+    v.val[3] = _mm_unpackhi_epi8(tmp1, tmp3);
+    tmp0 = _mm_unpacklo_epi32(v.val[0], v.val[2]);
+    tmp1 = _mm_unpackhi_epi32(v.val[0], v.val[2]);
+    tmp2 = _mm_unpacklo_epi32(v.val[1], v.val[3]);
+    tmp3 = _mm_unpackhi_epi32(v.val[1], v.val[3]);
+    v.val[0] = _mm_unpacklo_epi8(tmp0, tmp2);
+    v.val[1] = _mm_unpackhi_epi8(tmp0, tmp2);
+    v.val[2] = _mm_unpacklo_epi8(tmp1, tmp3);
+    v.val[3] = _mm_unpackhi_epi8(tmp1, tmp3);
+    return v;
+#elif defined(GI_RVV_INTRINSICS)
+    return vlseg4e8_v_i8m1x4((int8_t*)Buffer, GI_SIMD_LEN_BYTE / sizeof(int8_t));
+#else
+    GI_INT8_V4_t ret;
+    const int8_t* ptr = (int8_t*)Buffer;
+    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) {
+        ret.val[0][i] = ptr[4 * i];
+        ret.val[1][i] = ptr[4 * i + 1];
+        ret.val[2][i] = ptr[4 * i + 2];
+        ret.val[3][i] = ptr[4 * i + 3];
+    }
+    return ret;
+#endif
+}
+
 GI_FORCEINLINE
 void GiStoreInt32(void* Buffer, GI_INT32_t Vector) {
 #if defined(GI_NEON_INTRINSICS)
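The V3 variant follows the same contract for a stride of three (e.g. packed RGB bytes): subvector c receives every third byte starting at offset c. A scalar model of that contract, against which the SSE shuffle sequence above can be checked (names are illustrative):

#include <cstddef>
#include <cstdint>

// Scalar model of GiLoadUzipInt8V3 for a 16-lane vector: out[c][i] must
// equal ptr[3 * i + c], i.e. channel c of pixel i in packed RGB data.
void uzip3_ref(const int8_t* ptr, int8_t out[3][16]) {
    for (size_t i = 0; i < 16; ++i) {
        out[0][i] = ptr[3 * i];
        out[1][i] = ptr[3 * i + 1];
        out[2][i] = ptr[3 * i + 2];
    }
}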

dnn/src/fallback/pooling/gi/pooling_helper.h
@@ -471,7 +471,7 @@ void do_average_pooling_3x3_s2x2_gi(
     int odd_offset = 0, even_offset = 0;
     for (; iw + 2 * MEGDNN_SIMD_WIDTH <= IW; iw += 2 * MEGDNN_SIMD_WIDTH) {
-        auto s0 = GiLd2qFloat32(sptr + iw);
+        auto s0 = GiLoadUzipFloat32V2(sptr + iw);
         GiStoreFloat32(even + even_offset, GiGetSubVectorFloat32V2(s0, 0));
         GiStoreFloat32(odd + odd_offset, GiGetSubVectorFloat32V2(s0, 1));
         even_offset += MEGDNN_SIMD_WIDTH;

dnn/src/fallback/reduce/opr_impl.cpp
@@ -186,8 +186,15 @@ bool ReduceImpl::exec_optimized(
 #define DISPATCH_FUNC(Reducer, dtype, ctype, comp_type)                      \
     if (C == 1) {                                                            \
         using _Reducer = Reducer<dtype, ctype, comp_type, true>;             \
+        using _ReducerC1SmallB = Reducer<dtype, ctype, comp_type, false>;    \
         std::function<void(const ctype*, ctype*, DType, size_t, size_t, size_t)> \
                 do_reduce = Exec<_Reducer, true>::do_reduce;                 \
+        if (B == 2)                                                          \
+            do_reduce = ExecC1SmallB<_ReducerC1SmallB, ctype, 2>::do_reduce; \
+        if (B == 3)                                                          \
+            do_reduce = ExecC1SmallB<_ReducerC1SmallB, ctype, 3>::do_reduce; \
+        if (B == 4)                                                          \
+            do_reduce = ExecC1SmallB<_ReducerC1SmallB, ctype, 4>::do_reduce; \
         MIDOUT_BEGIN(                                                        \
                 megdnn_fallback_reduce_optimized, ctype, dtype, comp_type,   \
                 midout_iv(0)) {                                              \

dnn/src/fallback/reduce/reducer.h
@@ -46,8 +46,8 @@ struct MeanReducer<dt_qint8, int8_t, int32_t, false> {
         vcoef = GiFloat32Type2FixLenType(GiBroadcastFloat32(coef));
     }
     MeanReducer() = default;
-    void feed(const int8_t* val) {
-        const GI_INT8_t vval = GiLoadInt8(val);
+    void feed(const int8_t* val) { feed_vector(GiLoadInt8(val)); }
+    void feed_vector(const GI_INT8_t vval) {
         const GI_INT16_t vval_low = GiMoveLowLongInt8(vval);
         const GI_INT16_t vval_high = GiMoveHighLongInt8(vval);
@@ -121,9 +121,10 @@ struct MeanReducer<dt_float32, float, float, false> {
         res = GiFloat32Type2FixLenType(GiBroadcastFloat32(0.0f));
     }
     MeanReducer() = default;
-    void feed(const float* val) {
+    void feed(const float* val) { feed_vector(GiLoadFloat32(val)); }
+    void inline feed_vector(const GI_FLOAT32_t& val) {
         res = GiFloat32Type2FixLenType(
-                GiAddFloat32(GiLoadFloat32(val), GiFixLenType2GiFloat32Type(res)));
+                GiAddFloat32(val, GiFixLenType2GiFloat32Type(res)));
     }
     void feed_remain(const float* val) { remain += *val; }
     void post(float* dst) {
@@ -172,31 +173,31 @@ REDUCER_MAX_MIN_C1(min, Min, std::numeric_limits<dt_float32>::max());
 #define Max_NAN(a, b) (isnan(a) || (a) > (b)) ? (a) : (b);
 #define Min_NAN(a, b) (isnan(a) || (a) < (b)) ? (a) : (b);
-#define REDUCER_MAX_MIN_C(_mode, _Mode, _init)                               \
-    template <>                                                              \
-    struct _mode##Reducer<dt_float32, float, float, false> {                 \
-        using ctype = float;                                                 \
-        static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float);  \
-        GI_FLOAT32_FIXLEN_t res;                                             \
-        float remain;                                                        \
-        _mode##Reducer(DType, size_t) {                                      \
-            res = GiFloat32Type2FixLenType(GiBroadcastFloat32(_init));       \
-            remain = _init;                                                  \
-        }                                                                    \
-        _mode##Reducer() = default;                                          \
-        void feed(const float* val) {                                        \
-            GI_FLOAT32_t vval = GiLoadFloat32(val);                          \
-            res = GiFloat32Type2FixLenType(                                  \
-                    Gi##_Mode##NanFloat32(GiFixLenType2GiFloat32Type(res), vval)); \
-        }                                                                    \
-        void feed_remain(const float* val) {                                 \
-            using namespace std;                                             \
-            remain = _Mode##_NAN(*val, remain);                              \
-        }                                                                    \
-        void post(float* dst) {                                              \
-            GiStoreFloat32(dst, GiFixLenType2GiFloat32Type(res));            \
-        }                                                                    \
-        void post_remain(float* dst) { *dst = remain; }                      \
+#define REDUCER_MAX_MIN_C(_mode, _Mode, _init)                               \
+    template <>                                                              \
+    struct _mode##Reducer<dt_float32, float, float, false> {                 \
+        using ctype = float;                                                 \
+        static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float);  \
+        GI_FLOAT32_FIXLEN_t res;                                             \
+        float remain;                                                        \
+        _mode##Reducer(DType, size_t) {                                      \
+            res = GiFloat32Type2FixLenType(GiBroadcastFloat32(_init));       \
+            remain = _init;                                                  \
+        }                                                                    \
+        _mode##Reducer() = default;                                          \
+        void feed(const float* val) { feed_vector(GiLoadFloat32(val)); }     \
+        void inline feed_vector(const GI_FLOAT32_t& val) {                   \
+            res = GiFloat32Type2FixLenType(                                  \
+                    Gi##_Mode##NanFloat32(GiFixLenType2GiFloat32Type(res), val)); \
+        }                                                                    \
+        void feed_remain(const float* val) {                                 \
+            using namespace std;                                             \
+            remain = _Mode##_NAN(*val, remain);                              \
+        }                                                                    \
+        void post(float* dst) {                                              \
+            GiStoreFloat32(dst, GiFixLenType2GiFloat32Type(res));            \
+        }                                                                    \
+        void post_remain(float* dst) { *dst = remain; }                      \
     }
 REDUCER_MAX_MIN_C(max, Max, std::numeric_limits<dt_float32>::lowest());
@@ -246,10 +247,10 @@ REDUCER_MAX_MIN_C1(min, Min, 127);
         remain = _init;                                                      \
     }                                                                        \
     _mode##Reducer() = default;                                              \
-    void feed(const int8_t* val) {                                           \
-        GI_INT8_t vval = GiLoadInt8(val);                                    \
+    void feed(const int8_t* val) { feed_vector(GiLoadInt8(val)); }           \
+    void inline feed_vector(GI_INT8_t val) {                                 \
         res = GiInt8Type2FixLenType(                                         \
-                Gi##_Mode##imumInt8(GiFixLenType2GiInt8Type(res), vval));    \
+                Gi##_Mode##imumInt8(GiFixLenType2GiInt8Type(res), val));     \
     }                                                                        \
     void feed_remain(const int8_t* val) {                                    \
         using namespace std;                                                 \
@@ -304,32 +305,32 @@ REDUCER_SUM_PRODUCT_C1(Sum, Add, plus, 0.0f);
 REDUCER_SUM_PRODUCT_C1(Product, Multiply, multiplies, 1.0f);
 #undef REDUCER_SUM_PRODUCT_C1
 
-#define REDUCER_SUM_PRODUCT_C(_mode, _Mode, _op, _init)                      \
-    template <>                                                              \
-    struct _mode##Reducer<dt_float32, float, float, false> {                 \
-        using ctype = float;                                                 \
-        static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float);  \
-        GI_FLOAT32_FIXLEN_t res;                                             \
-        float remain;                                                        \
-        _mode##Reducer(DType, size_t) {                                      \
-            res = GiFloat32Type2FixLenType(GiBroadcastFloat32(_init));       \
-            remain = _init;                                                  \
-        }                                                                    \
-        _mode##Reducer() = default;                                          \
-        void feed(const float* val) {                                        \
-            GI_FLOAT32_t vval = GiLoadFloat32(val);                          \
-            res = GiFloat32Type2FixLenType(                                  \
-                    Gi##_Mode##Float32(vval, GiFixLenType2GiFloat32Type(res))); \
-        }                                                                    \
-        void feed_remain(const float* val) {                                 \
-            using namespace std;                                             \
-            auto op = _op<float>();                                          \
-            remain = op(remain, (*val));                                     \
-        }                                                                    \
-        void post(float* dst) {                                              \
-            GiStoreFloat32(dst, GiFixLenType2GiFloat32Type(res));            \
-        }                                                                    \
-        void post_remain(float* dst) { *dst = remain; }                      \
+#define REDUCER_SUM_PRODUCT_C(_mode, _Mode, _op, _init)                      \
+    template <>                                                              \
+    struct _mode##Reducer<dt_float32, float, float, false> {                 \
+        using ctype = float;                                                 \
+        static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float);  \
+        GI_FLOAT32_FIXLEN_t res;                                             \
+        float remain;                                                        \
+        _mode##Reducer(DType, size_t) {                                      \
+            res = GiFloat32Type2FixLenType(GiBroadcastFloat32(_init));       \
+            remain = _init;                                                  \
+        }                                                                    \
+        _mode##Reducer() = default;                                          \
+        void feed(const float* val) { feed_vector(GiLoadFloat32(val)); }     \
+        void inline feed_vector(GI_FLOAT32_t val) {                          \
+            res = GiFloat32Type2FixLenType(                                  \
+                    Gi##_Mode##Float32(val, GiFixLenType2GiFloat32Type(res))); \
+        }                                                                    \
+        void feed_remain(const float* val) {                                 \
+            using namespace std;                                             \
+            auto op = _op<float>();                                          \
+            remain = op(remain, (*val));                                     \
+        }                                                                    \
+        void post(float* dst) {                                              \
+            GiStoreFloat32(dst, GiFixLenType2GiFloat32Type(res));            \
+        }                                                                    \
+        void post_remain(float* dst) { *dst = remain; }                      \
     }
 REDUCER_SUM_PRODUCT_C(Sum, Add, plus, 0.0f);
@@ -378,15 +379,16 @@ struct SumSqrReducer<dt_float32, float, float, false> {
         res = GiFloat32Type2FixLenType(GiBroadcastFloat32(0.0f));
     }
     SumSqrReducer() = default;
-    void feed(const float* val) {
-        GI_FLOAT32_t vval = GiLoadFloat32(val);
+    void feed(const float* val) { feed_vector(GiLoadFloat32(val)); }
+    void inline feed_vector(GI_FLOAT32_t src) {
         res = GiFloat32Type2FixLenType(GiAddFloat32(
-                GiMultiplyFloat32(vval, vval), GiFixLenType2GiFloat32Type(res)));
+                GiMultiplyFloat32(src, src), GiFixLenType2GiFloat32Type(res)));
     }
     void feed_remain(const float* val) { remain += (*val) * (*val); }
     void post(float* dst) { GiStoreFloat32(dst, GiFixLenType2GiFloat32Type(res)); }
     void post_remain(float* dst) { *dst = remain; }
 };
 
 /**************************************do reduce*************************/
 template <typename Reducer, bool C1>
@@ -446,6 +448,57 @@ struct Exec<Reducer, false> {
     }
 };
 
+template <typename Reducer, typename dtype, size_t B>
+struct ExecC1SmallB {
+    static void do_reduce(
+            const dtype* src, dtype* dst, DType src_dtype, size_t A, size_t,
+            size_t C);
+};
+
+#define ImplementC1SmallB(_ctype, _gi_type, _gi_ins)                         \
+    template <typename Reducer, size_t B>                                    \
+    struct ExecC1SmallB<Reducer, _ctype, B> {                                \
+        static void do_reduce(                                               \
+                const _ctype* src, _ctype* dst, DType src_dtype, size_t A,   \
+                size_t, size_t) {                                            \
+            size_t a = 0;                                                    \
+            for (; a + Reducer::SIMD_WIDTH < A; a += Reducer::SIMD_WIDTH) {  \
+                Reducer reducer(src_dtype, B);                               \
+                auto src_ptr = src + a * B;                                  \
+                if (B == 4) {                                                \
+                    GI_##_gi_type##_V4_t data_v4 = GiLoadUzip##_gi_ins##V4(src_ptr); \
+                    reducer.feed_vector(GiGetSubVector##_gi_ins##V4(data_v4, 0)); \
+                    reducer.feed_vector(GiGetSubVector##_gi_ins##V4(data_v4, 1)); \
+                    reducer.feed_vector(GiGetSubVector##_gi_ins##V4(data_v4, 2)); \
+                    reducer.feed_vector(GiGetSubVector##_gi_ins##V4(data_v4, 3)); \
+                }                                                            \
+                if (B == 3) {                                                \
+                    GI_##_gi_type##_V3_t data_v3 = GiLoadUzip##_gi_ins##V3(src_ptr); \
+                    reducer.feed_vector(GiGetSubVector##_gi_ins##V3(data_v3, 0)); \
+                    reducer.feed_vector(GiGetSubVector##_gi_ins##V3(data_v3, 1)); \
+                    reducer.feed_vector(GiGetSubVector##_gi_ins##V3(data_v3, 2)); \
+                }                                                            \
+                if (B == 2) {                                                \
+                    GI_##_gi_type##_V2_t data_v2 = GiLoadUzip##_gi_ins##V2(src_ptr); \
+                    reducer.feed_vector(GiGetSubVector##_gi_ins##V2(data_v2, 0)); \
+                    reducer.feed_vector(GiGetSubVector##_gi_ins##V2(data_v2, 1)); \
+                }                                                            \
+                reducer.post(dst);                                           \
+                dst += Reducer::SIMD_WIDTH;                                  \
+            }                                                                \
+            for (; a < A; a++) {                                             \
+                Reducer reducer(src_dtype, B);                               \
+                auto src_ptr = src + a * B;                                  \
+                for (size_t i = 0; i < B; i++)                               \
+                    reducer.feed_remain(src_ptr + i);                        \
+                reducer.post_remain(dst);                                    \
+                dst++;                                                       \
+            }                                                                \
+        }                                                                    \
+    }
+ImplementC1SmallB(float, FLOAT32, Float32);
+ImplementC1SmallB(int8_t, INT8, Int8);
+
 }  // namespace
 
 // vim: syntax=cpp.doxygen

dnn/src/fallback/resize/gi/resize_cv.cpp
@@ -565,7 +565,8 @@ struct ResizeAreaFastVec_SIMD_32f {
         if (cn == 1) {
             for (; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) {
-                GI_FLOAT32_V2_t v_row0 = GiLd2qFloat32(S0), v_row1 = GiLd2qFloat32(S1);
+                GI_FLOAT32_V2_t v_row0 = GiLoadUzipFloat32V2(S0),
+                                v_row1 = GiLoadUzipFloat32V2(S1);
                 GI_FLOAT32_t v_dst0 = GiAddFloat32(
                         GiGetSubVectorFloat32V2(v_row0, 0),

dnn/test/arm_common/reduce.cpp
@@ -29,7 +29,7 @@ TEST_F(ARM_COMMON, REDUCE) {
     for (int32_t axis : {0, 1, 2}) {
         for (size_t A : {1, 3, 5}) {
             for (size_t B : {4, 6, 9, 16, 33, 45}) {
-                for (size_t C : {4, 6, 9, 16, 33, 45}) {
+                for (size_t C : {2, 3, 4, 6, 9, 16, 33, 45}) {
                     TensorShape shape{A, B, C};
                     Param param(mode, axis);
                     Config config(param, dtype, shape);
@@ -51,7 +51,7 @@ TEST_F(ARM_COMMON, REDUCE) {
     for (int32_t axis : {0, 1, 2}) {
         for (size_t A : {1, 3, 5}) {
             for (size_t B : {4, 6, 9, 16, 33, 45}) {
-                for (size_t C : {4, 6, 9, 16, 33, 45}) {
+                for (size_t C : {2, 3, 4, 6, 9, 16, 33, 45}) {
                     TensorShape shape{A, B, C};
                     Param param(mode, axis);
                     Config config(param, dtype, shape);

dnn/test/fallback/gi.cpp
@@ -1356,12 +1356,12 @@ TEST_F(FALLBACK, GiSt1Float32) {
     ASSERT_EQ(ret[1], s0[1]);
 }
 
-TEST_F(FALLBACK, GiLd2qFloat32) {
+TEST_F(FALLBACK, GiLoadUzipFloat32V2) {
     GI_FLOAT32_V2_t ret;
     std::vector<float> s0{1.1f, 2.2f, 3.5f, 4.9f, 2312.1f, 345.244f, 3.59f, -12.8f};
     force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE * 2);
-    ret = GiLd2qFloat32(s0.data());
+    ret = GiLoadUzipFloat32V2(s0.data());
     std::vector<float> naive0;
     std::vector<float> naive1;
@@ -2590,6 +2590,61 @@ TEST_F(FALLBACK, GiLoadInt8) {
     }
 }
 
+TEST_F(FALLBACK, GiLoadUzipInt8V2) {
+    std::vector<int8_t> s0{9, 2, -128, 127, 2, 45, 3, 0, 11, 2, -128, 127,
+                           2, 55, 3, -1, 7, 8, -18, 17, 12, 35, 7, 8,
+                           10, 22, -108, 27, 21, 45, 13, -11};
+    GI_INT8_V2_t ret;
+    force_memset_ret((void*)&ret, 2 * GI_SIMD_LEN_BYTE);
+    ret = GiLoadUzipInt8V2(s0.data());
+    auto p = (int8_t*)&ret;
+    for (size_t i = 0; i < SIMD_LEN_8; i++) {
+        ASSERT_EQ(p[i], s0[2 * i]);
+        ASSERT_EQ(p[SIMD_LEN_8 + i], s0[2 * i + 1]);
+    }
+}
+
+TEST_F(FALLBACK, GiLoadUzipInt8V3) {
+    std::vector<int8_t> s0{9, 2, -128, 127, 2, 45, 3, 0, 11, 2, -128, 127,
+                           2, 55, 3, -1, 7, 8, -18, 17, 12, 35, 7, 8,
+                           10, 22, -108, 27, 21, 45, 13, -11, 11, 14, -11, 12,
+                           111, 32, 6, 9, 16, 29, -118, 67, 28, 15, 19, -10};
+    GI_INT8_V3_t ret;
+    force_memset_ret((void*)&ret, 3 * GI_SIMD_LEN_BYTE);
+    ret = GiLoadUzipInt8V3(s0.data());
+    auto p = (int8_t*)&ret;
+    for (size_t i = 0; i < SIMD_LEN_8; i++) {
+        ASSERT_EQ(p[i], s0[3 * i]);
+        ASSERT_EQ(p[SIMD_LEN_8 + i], s0[3 * i + 1]);
+        ASSERT_EQ(p[2 * SIMD_LEN_8 + i], s0[3 * i + 2]);
+    }
+}
+
+TEST_F(FALLBACK, GiLoadUzipInt8V4) {
+    std::vector<int8_t> s0{9, 2, -128, 127, 2, 45, 3, 0, 11, 2, -128, 127,
+                           2, 55, 3, -1, 7, 8, -18, 17, 12, 35, 7, 8,
+                           10, 22, -108, 27, 21, 45, 13, -11, 11, 14, -11, 12,
+                           111, 32, 6, 9, 16, 29, -118, 67, 28, 15, 19, -10,
+                           9, 4, -108, 27, 22, 43, 13, 10, 31, 12, -108, 117,
+                           22, 25, 31, -10};
+    GI_INT8_V4_t ret;
+    force_memset_ret((void*)&ret, 4 * GI_SIMD_LEN_BYTE);
+    ret = GiLoadUzipInt8V4(s0.data());
+    auto p = (int8_t*)&ret;
+    for (size_t i = 0; i < SIMD_LEN_8; i++) {
+        ASSERT_EQ(p[i], s0[4 * i]);
+        ASSERT_EQ(p[SIMD_LEN_8 + i], s0[4 * i + 1]);
+        ASSERT_EQ(p[2 * SIMD_LEN_8 + i], s0[4 * i + 2]);
+        ASSERT_EQ(p[3 * SIMD_LEN_8 + i], s0[4 * i + 3]);
+    }
+}
+
 TEST_F(FALLBACK, GiStoreInt32) {
     GI_INT32_t src0;
     std::vector<int32_t> s0{1, 2, -200, 999};

dnn/test/fallback/reduce.cpp

#include "test/fallback/fixture.h"
#include "megdnn/oprs.h"
#include "test/common/benchmarker.h"
#include "test/common/checker.h"
#include "test/common/task_record_check.h"
#include "test/common/tensor.h"
@@ -27,9 +28,9 @@ TEST_F(FALLBACK, REDUCE_FULL) {
                  dtype::Float32(), dtype::Float16(), dtype::QuantizedS8(1.3f),
                  dtype::Quantized8Asymm(1.3f, static_cast<uint8_t>(3))})
         for (int32_t axis : {0, 1, 2}) {
-            for (size_t A : {1, 3, 5}) {
+            for (size_t A : {1, 3, 5, 20}) {
                 for (size_t B : {4, 6, 9, 16, 33, 45}) {
-                    for (size_t C : {4, 6, 9, 16, 33, 45}) {
+                    for (size_t C : {2, 3, 4, 6, 9, 16, 33, 45}) {
                         TensorShape shape{A, B, C};
                         Param param(mode, axis);
                         Config config(param, dtype, shape);
@@ -49,9 +50,9 @@ TEST_F(FALLBACK, REDUCE_FULL) {
     for (auto mode : {Mode::SUM, Mode::PRODUCT, Mode::SUM_SQR})
         for (auto dtype : std::vector<DType>{dtype::Float32(), dtype::Float16()})
             for (int32_t axis : {0, 1, 2}) {
-                for (size_t A : {1, 3, 5}) {
+                for (size_t A : {1, 3, 5, 20}) {
                     for (size_t B : {4, 6, 9, 16, 33, 45}) {
-                        for (size_t C : {4, 6, 9, 16, 33, 45}) {
+                        for (size_t C : {2, 3, 4, 6, 9, 16, 33, 45}) {
                             TensorShape shape{A, B, C};
                             Param param(mode, axis);
                             Config config(param, dtype, shape);
@@ -301,4 +302,56 @@ TEST_F(FALLBACK, REDUCE_RECORD) {
     }
 }
 
+#if MEGDNN_WITH_BENCHMARK
+TEST_F(FALLBACK, BENCHMARK_REDUCE_VS_CONV) {
+    auto run = [&]() {
+        Benchmarker<Reduce> benchmarker_reduce(handle());
+        Benchmarker<Convolution> benchmarker_conv(handle());
+        benchmarker_reduce.set_display(false);
+        benchmarker_conv.set_display(false);
+        constexpr size_t RUNS = 50;
+        benchmarker_reduce.set_times(RUNS);
+        benchmarker_conv.set_times(RUNS);
+
+        param::Reduce param;
+        param.axis = 3;
+        param.mode = param::Reduce::Mode::SUM;
+        benchmarker_reduce.set_param(param);
+
+        param::Convolution param_conv;
+        benchmarker_conv.set_param(param_conv);
+
+        {
+            TensorLayout src({24, 240, 128, 2}, dtype::Float32());
+            auto reduce = benchmarker_reduce.execs({src, {}}) / RUNS;
+            TensorLayout conv_src({24, 2, 240, 128}, dtype::Float32());
+            TensorLayout conv_weight({1, 2, 1, 1}, dtype::Float32());
+            auto conv = benchmarker_conv.execs({conv_src, conv_weight, {}}) / RUNS;
+            printf("case 1: reduce use time %fms, convolution use time %fms\n",
+                   reduce, conv);
+        }
+        {
+            TensorLayout src({24, 240, 128, 3}, dtype::Float32());
+            auto reduce = benchmarker_reduce.execs({src, {}}) / RUNS;
+            TensorLayout conv_src({24, 3, 240, 128}, dtype::Float32());
+            TensorLayout conv_weight({1, 3, 1, 1}, dtype::Float32());
+            auto conv = benchmarker_conv.execs({conv_src, conv_weight, {}}) / RUNS;
+            printf("case 2: reduce use time %fms, convolution use time %fms\n",
+                   reduce, conv);
+        }
+        {
+            TensorLayout src({24, 240, 128, 4}, dtype::Float32());
+            auto reduce = benchmarker_reduce.execs({src, {}}) / RUNS;
+            TensorLayout conv_src({24, 4, 240, 128}, dtype::Float32());
+            TensorLayout conv_weight({1, 4, 1, 1}, dtype::Float32());
+            auto conv = benchmarker_conv.execs({conv_src, conv_weight, {}}) / RUNS;
+            printf("case 3: reduce use time %fms, convolution use time %fms\n",
+                   reduce, conv);
+        }
+    };
+    run();
+}
+#endif
+
 // vim: syntax=cpp.doxygen
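The benchmark's premise is that SUM-reduce over a small trailing axis computes the same values as a 1x1 convolution with an all-ones kernel on the channels-first transpose of the input, so the two implementations are directly comparable. A scalar sketch of that equivalence (illustrative names only):

#include <cstddef>

// SUM-reduce over axis 3 of an (N, H, W, C) tensor: each group of C
// contiguous values collapses to one output. The same numbers come out of
// convolving the (N, C, H, W) transpose with a {1, C, 1, 1} kernel of ones,
// which is the pairing BENCHMARK_REDUCE_VS_CONV times against each other.
void reduce_axis3_ref(
        const float* src, float* dst, size_t n, size_t h, size_t w, size_t c) {
    for (size_t i = 0; i < n * h * w; ++i) {
        float acc = 0.f;
        for (size_t j = 0; j < c; ++j)
            acc += src[i * c + j];
        dst[i] = acc;
    }
}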

dnn/test/x86/convolution.cpp
@@ -467,6 +467,33 @@ TEST_F(X86, BENCHMARK_CONVOLUTION_I8x8x32_MKLDNN) {
 }
 #endif
 
+TEST_F(X86, BENCHMARK_REDUCE_VS_CONV) {
+    auto run = [&]() {
+        Benchmarker<Reduce> benchmarker_reduce(handle());
+        Benchmarker<Convolution> benchmarker_conv(handle());
+        benchmarker_reduce.set_display(false);
+        benchmarker_conv.set_display(false);
+        constexpr size_t RUNS = 50;
+        benchmarker_reduce.set_times(RUNS);
+        benchmarker_conv.set_times(RUNS);
+
+        param::Reduce param;
+        param.axis = 3;
+        param.mode = param::Reduce::Mode::SUM;
+        benchmarker_reduce.set_param(param);
+
+        param::Convolution param_conv;
+        benchmarker_conv.set_param(param_conv);
+
+        TensorLayout src({24, 240, 128, 3}, dtype::Float32());
+        auto reduce = benchmarker_reduce.execs({src, {}}) / RUNS;
+        TensorLayout conv_src({24, 3, 240, 128}, dtype::Float32());
+        TensorLayout conv_weight({1, 3, 1, 1}, dtype::Float32());
+        auto conv = benchmarker_conv.execs({conv_src, conv_weight, {}}) / RUNS;
+        printf("reduce use time %fms, convolution use time %fms\n", reduce, conv);
+    };
+    run();
+}
 #endif
 
 }  // namespace test