Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
MegEngine 天元
MegEngine
提交
58ba080d
MegEngine
项目概览
MegEngine 天元
/
MegEngine
1 年多 前同步成功
通知
410
Star
4707
Fork
583
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
MegEngine
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
提交
58ba080d
编写于
7月 14, 2022
作者:
M
Megvii Engine Team
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
feat(x86/rvv): make gi conv algo adapt to vv and vf model
GitOrigin-RevId: f29593be4df167f63029893bd9cf0fb667861622
上级
bd50e457
变更
7
展开全部
隐藏空白更改
内联
并排
Showing
7 changed file
with
799 addition
and
446 deletion
+799
-446
dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h
...gi/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h
+98
-19
dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h
...gi/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h
+135
-23
dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h
.../fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h
+21
-35
dnn/src/fallback/conv_bias/gi/fp32/do_conv_stride1.cpp
dnn/src/fallback/conv_bias/gi/fp32/do_conv_stride1.cpp
+250
-171
dnn/src/fallback/conv_bias/gi/fp32/do_conv_stride2.cpp
dnn/src/fallback/conv_bias/gi/fp32/do_conv_stride2.cpp
+178
-97
dnn/src/fallback/conv_bias/gi/fp32/strategy_f73_mk4_nchw44.cpp
...rc/fallback/conv_bias/gi/fp32/strategy_f73_mk4_nchw44.cpp
+116
-98
dnn/src/fallback/general_intrinsic/gi_float.h
dnn/src/fallback/general_intrinsic/gi_float.h
+1
-3
未找到文件。
dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h
浏览文件 @
58ba080d
...
...
@@ -24,21 +24,27 @@ struct ShiftCalHelper<src_idx, weight_idx, c_dim, ow_block, 0, T, T2, T3, T4> {
static
MEGDNN_ALWAYS_INLINE
void
impl
(
T
&
,
T2
&
,
T3
&
)
{}
};
#define cb2(step, lane, ow_block) \
c[0][step] = GiFloat32Type2FixLenType(GiSimdFmaLane( \
GiFixLenType2GiFloat32Type(c[0][step]), \
GiFixLenType2GiFloat32Type(weight[0][lane]), \
GiFixLenType2GiFloat32Type(src[(step + src_idx) % ow_block]), lane)); \
c[1][step] = GiFloat32Type2FixLenType(GiSimdFmaLane( \
GiFixLenType2GiFloat32Type(c[1][step]), \
GiFixLenType2GiFloat32Type(weight[1][lane]), \
GiFixLenType2GiFloat32Type(src[(step + src_idx) % ow_block]), lane));
#define cb(step, lane, ow_block) \
c[0][step] = GiFloat32Type2FixLenType(GiSimdFmaLane( \
GiFixLenType2GiFloat32Type(c[0][step]), \
GiFixLenType2GiFloat32Type(weight[0][lane]), \
GiFixLenType2GiFloat32Type(src[(step + src_idx) % ow_block]), lane));
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
//! The x86 and RVV implementations of GiSimdFmaLane are slow; as an alternative,
//! use GiMultiplyAddScalarFloat32.
#define MLA(a, b, c, d) \
GiMultiplyAddScalarFloat32( \
GiFixLenType2GiFloat32Type(a), GiFixLenType2GiFloat32Type(b), *(c + d))
#else
#define MLA(a, b, c, d) \
GiSimdFmaLane( \
GiFixLenType2GiFloat32Type(a), GiFixLenType2GiFloat32Type(b), \
GiFixLenType2GiFloat32Type(c), d)
#endif
#define cb2(step, lane, ow_block) \
c[0][step] = GiFloat32Type2FixLenType( \
MLA(c[0][step], weight[0][lane], src[(step + src_idx) % ow_block], lane)); \
c[1][step] = GiFloat32Type2FixLenType( \
MLA(c[1][step], weight[1][lane], src[(step + src_idx) % ow_block], lane));
#define cb(step, lane, ow_block) \
c[0][step] = GiFloat32Type2FixLenType( \
MLA(c[0][step], weight[0][lane], src[(step + src_idx) % ow_block], lane));
#define SHIFT_CAL_HELPER(ow_block, remain_w) \
template < \
...
...
@@ -81,6 +87,7 @@ SHIFT_CAL_HELPER(4, 4);
#undef SHIFT_CAL_HELPER
#undef cb
#undef cb2
#undef MLA
template
<
int
src_idx
,
int
weight_idx
,
int
c_dim
,
int
ow_block
,
int
remain_w
,
typename
T
,
...
...
@@ -145,14 +152,23 @@ struct KerGiXXs1Nchw44FP32<bias_mode, Op, remain_w, 2, oc_block, ow_block> {
for
(
int
ic_idx
=
0
;
ic_idx
<
ic
;
ic_idx
+=
ic_step
)
{
const
float
*
src_ptr
=
src_ptr_origin
+
ic_idx
*
ld_src_ic
;
for
(
int
fh_idx
=
0
;
fh_idx
<
filter_size
;
++
fh_idx
)
{
GI_FLOAT32_FIXLEN_t
src
[
ow_block
];
GI_FLOAT32_FIXLEN_t
weight
[
c_dim
][
ic_step
];
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
const
float
*
src
[
ow_block
];
load_ptr_helper
<
ow_block
,
0
,
simd_len
,
0
>
(
src
,
src_ptr
,
0
);
#else
GI_FLOAT32_FIXLEN_t
src
[
ow_block
];
load_helper
<
ow_block
,
0
,
simd_len
,
0
,
Vld1qF32S
>
(
src
,
src_ptr
,
0
);
#endif
load_helper
<
ic_step
,
0
,
oc_step
,
c_dim
,
Vld1qF32S
>
(
weight
,
weight_ptr
,
ld_weight_oc
);
cal_helper
<
0
,
0
,
c_dim
,
ow_block
,
remain_w
>
(
c
,
src
,
weight
);
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
src
[
0
]
=
src_ptr
+
(
ow_block
)
*
ic_step
;
#else
src
[
0
]
=
GiFloat32Type2FixLenType
(
GiLoadFloat32
(
src_ptr
+
(
ow_block
)
*
ic_step
));
#endif
load_helper
<
ic_step
,
1
*
ld_weight
,
oc_step
,
c_dim
,
Vld1qF32S
>
(
weight
,
weight_ptr
,
ld_weight_oc
);
cal_helper
<
1
,
0
,
c_dim
,
ow_block
,
remain_w
>
(
c
,
src
,
weight
);
...
...
@@ -188,19 +204,32 @@ struct KerGiXXs1Nchw44FP32<bias_mode, Op, remain_w, 3, oc_block, ow_block> {
for
(
int
ic_idx
=
0
;
ic_idx
<
ic
;
ic_idx
+=
ic_step
)
{
const
float
*
src_ptr
=
src_ptr_origin
+
ic_idx
*
ld_src_ic
;
for
(
int
fh_idx
=
0
;
fh_idx
<
filter_size
;
++
fh_idx
)
{
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
const
float
*
src
[
ow_block
];
load_ptr_helper
<
ow_block
,
0
,
simd_len
,
0
>
(
src
,
src_ptr
,
0
);
#else
GI_FLOAT32_FIXLEN_t
src
[
ow_block
];
GI_FLOAT32_FIXLEN_t
weight
[
c_dim
][
ic_step
];
load_helper
<
ow_block
,
0
,
simd_len
,
0
,
Vld1qF32S
>
(
src
,
src_ptr
,
0
);
#endif
GI_FLOAT32_FIXLEN_t
weight
[
c_dim
][
ic_step
];
load_helper
<
ic_step
,
0
,
oc_step
,
c_dim
,
Vld1qF32S
>
(
weight
,
weight_ptr
,
ld_weight_oc
);
cal_helper
<
0
,
0
,
c_dim
,
ow_block
,
remain_w
>
(
c
,
src
,
weight
);
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
src
[
0
]
=
src_ptr
+
(
ow_block
)
*
ic_step
;
#else
src
[
0
]
=
GiFloat32Type2FixLenType
(
GiLoadFloat32
(
src_ptr
+
(
ow_block
)
*
ic_step
));
#endif
load_helper
<
ic_step
,
1
*
ld_weight
,
oc_step
,
c_dim
,
Vld1qF32S
>
(
weight
,
weight_ptr
,
ld_weight_oc
);
cal_helper
<
1
,
0
,
c_dim
,
ow_block
,
remain_w
>
(
c
,
src
,
weight
);
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
src
[
1
]
=
src_ptr
+
(
ow_block
+
1
)
*
ic_step
;
#else
src
[
1
]
=
GiFloat32Type2FixLenType
(
GiLoadFloat32
(
src_ptr
+
(
ow_block
+
1
)
*
ic_step
));
#endif
load_helper
<
ic_step
,
2
*
ld_weight
,
oc_step
,
c_dim
,
Vld1qF32S
>
(
weight
,
weight_ptr
,
ld_weight_oc
);
cal_helper
<
2
,
0
,
c_dim
,
ow_block
,
remain_w
>
(
c
,
src
,
weight
);
...
...
@@ -235,33 +264,54 @@ struct KerGiXXs1Nchw44FP32<bias_mode, Op, remain_w, 5, oc_block, ow_block> {
for
(
int
ic_idx
=
0
;
ic_idx
<
ic
;
ic_idx
+=
ic_step
)
{
const
float
*
src_ptr
=
src_ptr_origin
+
ic_idx
*
ld_src_ic
;
for
(
int
fh_idx
=
0
;
fh_idx
<
filter_size
;
++
fh_idx
)
{
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
const
float
*
src
[
ow_block
];
load_ptr_helper
<
ow_block
,
0
,
simd_len
,
0
>
(
src
,
src_ptr
,
0
);
#else
GI_FLOAT32_FIXLEN_t
src
[
ow_block
];
GI_FLOAT32_FIXLEN_t
weight
[
c_dim
][
ic_step
];
load_helper
<
ow_block
,
0
,
simd_len
,
0
,
Vld1qF32S
>
(
src
,
src_ptr
,
0
);
#endif
GI_FLOAT32_FIXLEN_t
weight
[
c_dim
][
ic_step
];
load_helper
<
ic_step
,
0
,
oc_step
,
c_dim
,
Vld1qF32S
>
(
weight
,
weight_ptr
,
ld_weight_oc
);
cal_helper
<
0
,
0
,
c_dim
,
ow_block
,
remain_w
>
(
c
,
src
,
weight
);
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
src
[
0
]
=
src_ptr
+
(
ow_block
)
*
ic_step
;
#else
src
[
0
]
=
GiFloat32Type2FixLenType
(
GiLoadFloat32
(
src_ptr
+
(
ow_block
)
*
ic_step
));
#endif
load_helper
<
ic_step
,
1
*
ld_weight
,
oc_step
,
c_dim
,
Vld1qF32S
>
(
weight
,
weight_ptr
,
ld_weight_oc
);
cal_helper
<
1
,
0
,
c_dim
,
ow_block
,
remain_w
>
(
c
,
src
,
weight
);
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
src
[
1
]
=
src_ptr
+
(
ow_block
+
1
)
*
ic_step
;
#else
src
[
1
]
=
GiFloat32Type2FixLenType
(
GiLoadFloat32
(
src_ptr
+
(
ow_block
+
1
)
*
ic_step
));
#endif
load_helper
<
ic_step
,
2
*
ld_weight
,
oc_step
,
c_dim
,
Vld1qF32S
>
(
weight
,
weight_ptr
,
ld_weight_oc
);
cal_helper
<
2
,
0
,
c_dim
,
ow_block
,
remain_w
>
(
c
,
src
,
weight
);
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
src
[
2
]
=
src_ptr
+
(
ow_block
+
2
)
*
ic_step
;
#else
src
[
2
]
=
GiFloat32Type2FixLenType
(
GiLoadFloat32
(
src_ptr
+
(
ow_block
+
2
)
*
ic_step
));
#endif
load_helper
<
ic_step
,
3
*
ld_weight
,
oc_step
,
c_dim
,
Vld1qF32S
>
(
weight
,
weight_ptr
,
ld_weight_oc
);
cal_helper
<
3
,
0
,
c_dim
,
ow_block
,
remain_w
>
(
c
,
src
,
weight
);
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
src
[
3
]
=
src_ptr
+
(
ow_block
+
3
)
*
ic_step
;
#else
src
[
3
]
=
GiFloat32Type2FixLenType
(
GiLoadFloat32
(
src_ptr
+
(
ow_block
+
3
)
*
ic_step
));
#endif
load_helper
<
ic_step
,
4
*
ld_weight
,
oc_step
,
c_dim
,
Vld1qF32S
>
(
weight
,
weight_ptr
,
ld_weight_oc
);
cal_helper
<
4
,
0
,
c_dim
,
ow_block
,
remain_w
>
(
c
,
src
,
weight
);
...
...
@@ -297,45 +347,74 @@ struct KerGiXXs1Nchw44FP32<bias_mode, Op, remain_w, 7, oc_block, ow_block> {
for
(
int
ic_idx
=
0
;
ic_idx
<
ic
;
ic_idx
+=
ic_step
)
{
const
float
*
src_ptr
=
src_ptr_origin
+
ic_idx
*
ld_src_ic
;
for
(
int
fh_idx
=
0
;
fh_idx
<
filter_size
;
++
fh_idx
)
{
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
const
float
*
src
[
ow_block
];
load_ptr_helper
<
ow_block
,
0
,
simd_len
,
0
>
(
src
,
src_ptr
,
0
);
#else
GI_FLOAT32_FIXLEN_t
src
[
ow_block
];
GI_FLOAT32_FIXLEN_t
weight
[
c_dim
][
ic_step
];
load_helper
<
ow_block
,
0
,
simd_len
,
0
,
Vld1qF32S
>
(
src
,
src_ptr
,
0
);
#endif
GI_FLOAT32_FIXLEN_t
weight
[
c_dim
][
ic_step
];
load_helper
<
ic_step
,
0
,
oc_step
,
c_dim
,
Vld1qF32S
>
(
weight
,
weight_ptr
,
ld_weight_oc
);
cal_helper
<
0
,
0
,
c_dim
,
ow_block
,
remain_w
>
(
c
,
src
,
weight
);
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
src
[
0
]
=
src_ptr
+
(
ow_block
)
*
ic_step
;
#else
src
[
0
]
=
GiFloat32Type2FixLenType
(
GiLoadFloat32
(
src_ptr
+
(
ow_block
)
*
ic_step
));
#endif
load_helper
<
ic_step
,
1
*
ld_weight
,
oc_step
,
c_dim
,
Vld1qF32S
>
(
weight
,
weight_ptr
,
ld_weight_oc
);
cal_helper
<
1
,
0
,
c_dim
,
ow_block
,
remain_w
>
(
c
,
src
,
weight
);
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
src
[
1
]
=
src_ptr
+
(
ow_block
+
1
)
*
ic_step
;
#else
src
[
1
]
=
GiFloat32Type2FixLenType
(
GiLoadFloat32
(
src_ptr
+
(
ow_block
+
1
)
*
ic_step
));
#endif
load_helper
<
ic_step
,
2
*
ld_weight
,
oc_step
,
c_dim
,
Vld1qF32S
>
(
weight
,
weight_ptr
,
ld_weight_oc
);
cal_helper
<
2
,
0
,
c_dim
,
ow_block
,
remain_w
>
(
c
,
src
,
weight
);
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
src
[
2
]
=
src_ptr
+
(
ow_block
+
2
)
*
ic_step
;
#else
src
[
2
]
=
GiFloat32Type2FixLenType
(
GiLoadFloat32
(
src_ptr
+
(
ow_block
+
2
)
*
ic_step
));
#endif
load_helper
<
ic_step
,
3
*
ld_weight
,
oc_step
,
c_dim
,
Vld1qF32S
>
(
weight
,
weight_ptr
,
ld_weight_oc
);
cal_helper
<
3
,
0
,
c_dim
,
ow_block
,
remain_w
>
(
c
,
src
,
weight
);
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
src
[
3
]
=
src_ptr
+
(
ow_block
+
3
)
*
ic_step
;
#else
src
[
3
]
=
GiFloat32Type2FixLenType
(
GiLoadFloat32
(
src_ptr
+
(
ow_block
+
3
)
*
ic_step
));
#endif
load_helper
<
ic_step
,
4
*
ld_weight
,
oc_step
,
c_dim
,
Vld1qF32S
>
(
weight
,
weight_ptr
,
ld_weight_oc
);
cal_helper
<
4
,
0
,
c_dim
,
ow_block
,
remain_w
>
(
c
,
src
,
weight
);
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
src
[
4
]
=
src_ptr
+
(
ow_block
+
4
)
*
ic_step
;
#else
src
[
4
]
=
GiFloat32Type2FixLenType
(
GiLoadFloat32
(
src_ptr
+
(
ow_block
+
4
)
*
ic_step
));
#endif
load_helper
<
ic_step
,
5
*
ld_weight
,
oc_step
,
c_dim
,
Vld1qF32S
>
(
weight
,
weight_ptr
,
ld_weight_oc
);
cal_helper
<
5
,
0
,
c_dim
,
ow_block
,
remain_w
>
(
c
,
src
,
weight
);
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
src
[
5
]
=
src_ptr
+
(
ow_block
+
5
)
*
ic_step
;
#else
src
[
5
]
=
GiFloat32Type2FixLenType
(
GiLoadFloat32
(
src_ptr
+
(
ow_block
+
5
)
*
ic_step
));
#endif
load_helper
<
ic_step
,
6
*
ld_weight
,
oc_step
,
c_dim
,
Vld1qF32S
>
(
weight
,
weight_ptr
,
ld_weight_oc
);
cal_helper
<
6
,
0
,
c_dim
,
ow_block
,
remain_w
>
(
c
,
src
,
weight
);
...
...
dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h
浏览文件 @
58ba080d
...
...
@@ -24,21 +24,28 @@ struct ShiftCalHelper<src_idx, weight_idx, c_dim, ow_block, 0, T, T2, T3, T4> {
static
MEGDNN_ALWAYS_INLINE
void
impl
(
T
&
,
T2
&
,
T3
&
)
{}
};
#define cb2(step, lane, ow_block) \
c[0][step] = GiFloat32Type2FixLenType(GiSimdFmaLane( \
GiFixLenType2GiFloat32Type(c[0][step]), \
GiFixLenType2GiFloat32Type(weight[0][lane]), \
GiFixLenType2GiFloat32Type(src[(step + src_idx) % ow_block]), lane)); \
c[1][step] = GiFloat32Type2FixLenType(GiSimdFmaLane( \
GiFixLenType2GiFloat32Type(c[1][step]), \
GiFixLenType2GiFloat32Type(weight[1][lane]), \
GiFixLenType2GiFloat32Type(src[(step + src_idx) % ow_block]), lane));
#define cb(step, lane, ow_block) \
c[0][step] = GiFloat32Type2FixLenType(GiSimdFmaLane( \
GiFixLenType2GiFloat32Type(c[0][step]), \
GiFixLenType2GiFloat32Type(weight[0][lane]), \
GiFixLenType2GiFloat32Type(src[(step + src_idx) % ow_block]), lane));
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
//! The x86 and RVV implementations of GiSimdFmaLane are slow; as an alternative,
//! use GiMultiplyAddScalarFloat32.
#define MLA(a, b, c, d) \
GiMultiplyAddScalarFloat32( \
GiFixLenType2GiFloat32Type(a), GiFixLenType2GiFloat32Type(b), *(c + d))
#else
#define MLA(a, b, c, d) \
GiSimdFmaLane( \
GiFixLenType2GiFloat32Type(a), GiFixLenType2GiFloat32Type(b), \
GiFixLenType2GiFloat32Type(c), d)
#endif
#define cb2(step, lane, ow_block) \
c[0][step] = GiFloat32Type2FixLenType( \
MLA(c[0][step], weight[0][lane], src[(step + src_idx) % ow_block], lane)); \
c[1][step] = GiFloat32Type2FixLenType( \
MLA(c[1][step], weight[1][lane], src[(step + src_idx) % ow_block], lane));
#define cb(step, lane, ow_block) \
c[0][step] = GiFloat32Type2FixLenType( \
MLA(c[0][step], weight[0][lane], src[(step + src_idx) % ow_block], lane));
#define SHIFT_CAL_HELPER(ow_block, remain_w) \
template < \
...
...
@@ -81,6 +88,7 @@ SHIFT_CAL_HELPER(4, 4);
#undef SHIFT_CAL_HELPER
#undef cb
#undef cb2
#undef MLA
template
<
int
src_idx
,
int
weight_idx
,
int
c_dim
,
int
ow_block
,
int
remain_w
,
typename
T
,
...
...
@@ -146,15 +154,24 @@ struct KerGiXXs2Nchw44FP32<bias_mode, Op, remain_w, 2, oc_block, ow_block> {
const
float
*
src_ptr
=
src_ptr_origin
+
ic_idx
*
ld_src_ic
;
const
float
*
src_ptr_odd
=
src_ptr_odd_origin
+
ic_idx
*
ld_src_ic
;
GI_FLOAT32_FIXLEN_t
src
[
ow_block
];
GI_FLOAT32_FIXLEN_t
weight
[
c_dim
][
4
];
/////////row 0/////////////
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
const
float
*
src
[
ow_block
];
load_ptr_helper
<
ow_block
,
0
,
simd_len
,
0
>
(
src
,
src_ptr
,
0
);
#else
GI_FLOAT32_FIXLEN_t
src
[
ow_block
];
load_helper
<
ow_block
,
0
,
simd_len
,
0
,
Vld1qF32S
>
(
src
,
src_ptr
,
0
);
#endif
/////////row 0/////////////
load_helper
<
4
,
0
,
oc_step
,
c_dim
,
Vld1qF32S
>
(
weight
,
weight_ptr
,
ld_weight_oc
);
cal_helper
<
0
,
0
,
c_dim
,
ow_block
,
remain_w
>
(
c
,
src
,
weight
);
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
load_ptr_helper
<
ow_block
,
0
,
simd_len
,
0
>
(
src
,
src_ptr_odd
,
0
);
#else
load_helper
<
ow_block
,
0
,
simd_len
,
0
,
Vld1qF32S
>
(
src
,
src_ptr_odd
,
0
);
#endif
load_helper
<
4
,
1
*
ld_weight
,
oc_step
,
c_dim
,
Vld1qF32S
>
(
weight
,
weight_ptr
,
ld_weight_oc
);
cal_helper
<
0
,
0
,
c_dim
,
ow_block
,
remain_w
>
(
c
,
src
,
weight
);
...
...
@@ -162,12 +179,20 @@ struct KerGiXXs2Nchw44FP32<bias_mode, Op, remain_w, 2, oc_block, ow_block> {
src_ptr_odd
+=
ld_src_iw
;
weight_ptr
+=
ld_weight_fh
;
/////////row 1/////////////
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
load_ptr_helper
<
ow_block
,
0
,
simd_len
,
0
>
(
src
,
src_ptr
,
0
);
#else
load_helper
<
ow_block
,
0
,
simd_len
,
0
,
Vld1qF32S
>
(
src
,
src_ptr
,
0
);
#endif
load_helper
<
4
,
0
,
oc_step
,
c_dim
,
Vld1qF32S
>
(
weight
,
weight_ptr
,
ld_weight_oc
);
cal_helper
<
0
,
0
,
c_dim
,
ow_block
,
remain_w
>
(
c
,
src
,
weight
);
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
load_ptr_helper
<
ow_block
,
0
,
simd_len
,
0
>
(
src
,
src_ptr_odd
,
0
);
#else
load_helper
<
ow_block
,
0
,
simd_len
,
0
,
Vld1qF32S
>
(
src
,
src_ptr_odd
,
0
);
#endif
load_helper
<
4
,
1
*
ld_weight
,
oc_step
,
c_dim
,
Vld1qF32S
>
(
weight
,
weight_ptr
,
ld_weight_oc
);
cal_helper
<
0
,
0
,
c_dim
,
ow_block
,
remain_w
>
(
c
,
src
,
weight
);
...
...
@@ -203,21 +228,34 @@ struct KerGiXXs2Nchw44FP32<bias_mode, Op, remain_w, 3, oc_block, ow_block> {
const
float
*
src_ptr
=
src_ptr_origin
+
ic_idx
*
ld_src_ic
;
const
float
*
src_ptr_odd
=
src_ptr_odd_origin
+
ic_idx
*
ld_src_ic
;
GI_FLOAT32_FIXLEN_t
src
[
ow_block
];
GI_FLOAT32_FIXLEN_t
weight
[
c_dim
][
4
];
/////////row 0/////////////
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
const
float
*
src
[
ow_block
];
load_ptr_helper
<
ow_block
,
0
,
simd_len
,
0
>
(
src
,
src_ptr
,
0
);
#else
GI_FLOAT32_FIXLEN_t
src
[
ow_block
];
load_helper
<
ow_block
,
0
,
simd_len
,
0
,
Vld1qF32S
>
(
src
,
src_ptr
,
0
);
#endif
/////////row 0/////////////
load_helper
<
4
,
0
,
oc_step
,
c_dim
,
Vld1qF32S
>
(
weight
,
weight_ptr
,
ld_weight_oc
);
cal_helper
<
0
,
0
,
c_dim
,
ow_block
,
remain_w
>
(
c
,
src
,
weight
);
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
src
[
0
]
=
src_ptr
+
ow_block
*
simd_len
;
#else
src
[
0
]
=
GiFloat32Type2FixLenType
(
GiLoadFloat32
(
src_ptr
+
ow_block
*
simd_len
));
#endif
load_helper
<
4
,
2
*
ld_weight
,
oc_step
,
c_dim
,
Vld1qF32S
>
(
weight
,
weight_ptr
,
ld_weight_oc
);
cal_helper
<
1
,
0
,
c_dim
,
ow_block
,
remain_w
>
(
c
,
src
,
weight
);
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
load_ptr_helper
<
ow_block
,
0
,
simd_len
,
0
>
(
src
,
src_ptr_odd
,
0
);
#else
load_helper
<
ow_block
,
0
,
simd_len
,
0
,
Vld1qF32S
>
(
src
,
src_ptr_odd
,
0
);
#endif
load_helper
<
4
,
1
*
ld_weight
,
oc_step
,
c_dim
,
Vld1qF32S
>
(
weight
,
weight_ptr
,
ld_weight_oc
);
cal_helper
<
0
,
0
,
c_dim
,
ow_block
,
remain_w
>
(
c
,
src
,
weight
);
...
...
@@ -225,17 +263,29 @@ struct KerGiXXs2Nchw44FP32<bias_mode, Op, remain_w, 3, oc_block, ow_block> {
src_ptr_odd
+=
ld_src_iw
;
weight_ptr
+=
ld_weight_fh
;
/////////row 1/////////////
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
load_ptr_helper
<
ow_block
,
0
,
simd_len
,
0
>
(
src
,
src_ptr
,
0
);
#else
load_helper
<
ow_block
,
0
,
simd_len
,
0
,
Vld1qF32S
>
(
src
,
src_ptr
,
0
);
#endif
load_helper
<
4
,
0
,
oc_step
,
c_dim
,
Vld1qF32S
>
(
weight
,
weight_ptr
,
ld_weight_oc
);
cal_helper
<
0
,
0
,
c_dim
,
ow_block
,
remain_w
>
(
c
,
src
,
weight
);
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
src
[
0
]
=
src_ptr
+
ow_block
*
simd_len
;
#else
src
[
0
]
=
GiFloat32Type2FixLenType
(
GiLoadFloat32
(
src_ptr
+
ow_block
*
simd_len
));
#endif
load_helper
<
4
,
2
*
ld_weight
,
oc_step
,
c_dim
,
Vld1qF32S
>
(
weight
,
weight_ptr
,
ld_weight_oc
);
cal_helper
<
1
,
0
,
c_dim
,
ow_block
,
remain_w
>
(
c
,
src
,
weight
);
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
load_ptr_helper
<
ow_block
,
0
,
simd_len
,
0
>
(
src
,
src_ptr_odd
,
0
);
#else
load_helper
<
ow_block
,
0
,
simd_len
,
0
,
Vld1qF32S
>
(
src
,
src_ptr_odd
,
0
);
#endif
load_helper
<
4
,
1
*
ld_weight
,
oc_step
,
c_dim
,
Vld1qF32S
>
(
weight
,
weight_ptr
,
ld_weight_oc
);
cal_helper
<
0
,
0
,
c_dim
,
ow_block
,
remain_w
>
(
c
,
src
,
weight
);
...
...
@@ -243,18 +293,30 @@ struct KerGiXXs2Nchw44FP32<bias_mode, Op, remain_w, 3, oc_block, ow_block> {
src_ptr_odd
+=
ld_src_iw
;
weight_ptr
+=
ld_weight_fh
;
//////////row 2/////////////
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
load_ptr_helper
<
ow_block
,
0
,
simd_len
,
0
>
(
src
,
src_ptr
,
0
);
#else
load_helper
<
ow_block
,
0
,
simd_len
,
0
,
Vld1qF32S
>
(
src
,
src_ptr
,
0
);
#endif
load_helper
<
4
,
0
,
oc_step
,
c_dim
,
Vld1qF32S
>
(
weight
,
weight_ptr
,
ld_weight_oc
);
cal_helper
<
0
,
0
,
c_dim
,
ow_block
,
remain_w
>
(
c
,
src
,
weight
);
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
src
[
0
]
=
src_ptr
+
ow_block
*
simd_len
;
#else
src
[
0
]
=
GiFloat32Type2FixLenType
(
GiLoadFloat32
(
src_ptr
+
ow_block
*
simd_len
));
#endif
load_helper
<
4
,
2
*
ld_weight
,
oc_step
,
c_dim
,
Vld1qF32S
>
(
weight
,
weight_ptr
,
ld_weight_oc
);
cal_helper
<
1
,
0
,
c_dim
,
ow_block
,
remain_w
>
(
c
,
src
,
weight
);
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
load_ptr_helper
<
ow_block
,
0
,
simd_len
,
0
>
(
src
,
src_ptr_odd
,
0
);
#else
load_helper
<
ow_block
,
0
,
simd_len
,
0
,
Vld1qF32S
>
(
src
,
src_ptr_odd
,
0
);
#endif
load_helper
<
4
,
1
*
ld_weight
,
oc_step
,
c_dim
,
Vld1qF32S
>
(
weight
,
weight_ptr
,
ld_weight_oc
);
cal_helper
<
0
,
0
,
c_dim
,
ow_block
,
remain_w
>
(
c
,
src
,
weight
);
...
...
@@ -292,30 +354,51 @@ struct KerGiXXs2Nchw44FP32<bias_mode, Op, remain_w, 5, oc_block, ow_block> {
const
float
*
src_ptr_odd
=
src_ptr_odd_origin
+
ic_idx
*
ld_src_ic
;
for
(
int
fh_idx
=
0
;
fh_idx
<
filter_size
;
++
fh_idx
)
{
GI_FLOAT32_FIXLEN_t
src
[
ow_block
];
GI_FLOAT32_FIXLEN_t
weight
[
c_dim
][
4
];
// even element
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
const
float
*
src
[
ow_block
];
load_ptr_helper
<
ow_block
,
0
,
simd_len
,
0
>
(
src
,
src_ptr
,
0
);
#else
GI_FLOAT32_FIXLEN_t
src
[
ow_block
];
load_helper
<
ow_block
,
0
,
simd_len
,
0
,
Vld1qF32S
>
(
src
,
src_ptr
,
0
);
#endif
// even element
load_helper
<
4
,
0
,
oc_step
,
c_dim
,
Vld1qF32S
>
(
weight
,
weight_ptr
,
ld_weight_oc
);
cal_helper
<
0
,
0
,
c_dim
,
ow_block
,
remain_w
>
(
c
,
src
,
weight
);
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
src
[
0
]
=
src_ptr
+
ow_block
*
simd_len
;
#else
src
[
0
]
=
GiFloat32Type2FixLenType
(
GiLoadFloat32
(
src_ptr
+
ow_block
*
simd_len
));
#endif
load_helper
<
4
,
2
*
ld_weight
,
oc_step
,
c_dim
,
Vld1qF32S
>
(
weight
,
weight_ptr
,
ld_weight_oc
);
cal_helper
<
1
,
0
,
c_dim
,
ow_block
,
remain_w
>
(
c
,
src
,
weight
);
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
src
[
1
]
=
src_ptr
+
(
ow_block
+
1
)
*
simd_len
;
#else
src
[
1
]
=
GiFloat32Type2FixLenType
(
GiLoadFloat32
(
src_ptr
+
(
ow_block
+
1
)
*
simd_len
));
#endif
load_helper
<
4
,
4
*
ld_weight
,
oc_step
,
c_dim
,
Vld1qF32S
>
(
weight
,
weight_ptr
,
ld_weight_oc
);
cal_helper
<
2
,
0
,
c_dim
,
ow_block
,
remain_w
>
(
c
,
src
,
weight
);
// odd element
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
load_ptr_helper
<
ow_block
,
0
,
simd_len
,
0
>
(
src
,
src_ptr_odd
,
0
);
#else
load_helper
<
ow_block
,
0
,
simd_len
,
0
,
Vld1qF32S
>
(
src
,
src_ptr_odd
,
0
);
#endif
load_helper
<
4
,
1
*
ld_weight
,
oc_step
,
c_dim
,
Vld1qF32S
>
(
weight
,
weight_ptr
,
ld_weight_oc
);
cal_helper
<
0
,
0
,
c_dim
,
ow_block
,
remain_w
>
(
c
,
src
,
weight
);
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
src
[
0
]
=
src_ptr_odd
+
ow_block
*
simd_len
;
#else
src
[
0
]
=
GiFloat32Type2FixLenType
(
GiLoadFloat32
(
src_ptr_odd
+
ow_block
*
simd_len
));
#endif
load_helper
<
4
,
3
*
ld_weight
,
oc_step
,
c_dim
,
Vld1qF32S
>
(
weight
,
weight_ptr
,
ld_weight_oc
);
cal_helper
<
1
,
0
,
c_dim
,
ow_block
,
remain_w
>
(
c
,
src
,
weight
);
...
...
@@ -360,40 +443,69 @@ struct KerGiXXs2Nchw44FP32<bias_mode, Op, remain_w, 7, oc_block, ow_block> {
const
float
*
src_ptr_odd
=
src_ptr_odd_origin
+
ic_idx
*
ld_src_ic
;
for
(
int
fh_idx
=
0
;
fh_idx
<
filter_size
;
++
fh_idx
)
{
GI_FLOAT32_FIXLEN_t
src
[
ow_block
];
GI_FLOAT32_FIXLEN_t
weight
[
c_dim
][
4
];
// even element
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
const
float
*
src
[
ow_block
];
load_ptr_helper
<
ow_block
,
0
,
simd_len
,
0
>
(
src
,
src_ptr
,
0
);
#else
GI_FLOAT32_FIXLEN_t
src
[
ow_block
];
load_helper
<
ow_block
,
0
,
simd_len
,
0
,
Vld1qF32S
>
(
src
,
src_ptr
,
0
);
#endif
// even element
load_helper
<
4
,
0
,
oc_step
,
c_dim
,
Vld1qF32S
>
(
weight
,
weight_ptr
,
ld_weight_oc
);
cal_helper
<
0
,
0
,
c_dim
,
ow_block
,
remain_w
>
(
c
,
src
,
weight
);
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
src
[
0
]
=
src_ptr
+
ow_block
*
simd_len
;
#else
src
[
0
]
=
GiFloat32Type2FixLenType
(
GiLoadFloat32
(
src_ptr
+
ow_block
*
simd_len
));
#endif
load_helper
<
4
,
2
*
ld_weight
,
oc_step
,
c_dim
,
Vld1qF32S
>
(
weight
,
weight_ptr
,
ld_weight_oc
);
cal_helper
<
1
,
0
,
c_dim
,
ow_block
,
remain_w
>
(
c
,
src
,
weight
);
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
src
[
1
]
=
src_ptr
+
(
ow_block
+
1
)
*
simd_len
;
#else
src
[
1
]
=
GiFloat32Type2FixLenType
(
GiLoadFloat32
(
src_ptr
+
(
ow_block
+
1
)
*
simd_len
));
#endif
load_helper
<
4
,
4
*
ld_weight
,
oc_step
,
c_dim
,
Vld1qF32S
>
(
weight
,
weight_ptr
,
ld_weight_oc
);
cal_helper
<
2
,
0
,
c_dim
,
ow_block
,
remain_w
>
(
c
,
src
,
weight
);
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
src
[
2
]
=
src_ptr
+
(
ow_block
+
2
)
*
simd_len
;
#else
src
[
2
]
=
GiFloat32Type2FixLenType
(
GiLoadFloat32
(
src_ptr
+
(
ow_block
+
2
)
*
simd_len
));
#endif
load_helper
<
4
,
6
*
ld_weight
,
oc_step
,
c_dim
,
Vld1qF32S
>
(
weight
,
weight_ptr
,
ld_weight_oc
);
cal_helper
<
3
,
0
,
c_dim
,
ow_block
,
remain_w
>
(
c
,
src
,
weight
);
// odd element
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
load_ptr_helper
<
ow_block
,
0
,
simd_len
,
0
>
(
src
,
src_ptr_odd
,
0
);
#else
load_helper
<
ow_block
,
0
,
simd_len
,
0
,
Vld1qF32S
>
(
src
,
src_ptr_odd
,
0
);
#endif
load_helper
<
4
,
1
*
ld_weight
,
oc_step
,
c_dim
,
Vld1qF32S
>
(
weight
,
weight_ptr
,
ld_weight_oc
);
cal_helper
<
0
,
0
,
c_dim
,
ow_block
,
remain_w
>
(
c
,
src
,
weight
);
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
src
[
0
]
=
src_ptr_odd
+
ow_block
*
simd_len
;
#else
src
[
0
]
=
GiFloat32Type2FixLenType
(
GiLoadFloat32
(
src_ptr_odd
+
ow_block
*
simd_len
));
#endif
load_helper
<
4
,
3
*
ld_weight
,
oc_step
,
c_dim
,
Vld1qF32S
>
(
weight
,
weight_ptr
,
ld_weight_oc
);
cal_helper
<
1
,
0
,
c_dim
,
ow_block
,
remain_w
>
(
c
,
src
,
weight
);
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
src
[
1
]
=
src_ptr_odd
+
(
ow_block
+
1
)
*
simd_len
;
#else
src
[
1
]
=
GiFloat32Type2FixLenType
(
GiLoadFloat32
(
src_ptr_odd
+
(
ow_block
+
1
)
*
simd_len
));
#endif
load_helper
<
4
,
5
*
ld_weight
,
oc_step
,
c_dim
,
Vld1qF32S
>
(
weight
,
weight_ptr
,
ld_weight_oc
);
cal_helper
<
2
,
0
,
c_dim
,
ow_block
,
remain_w
>
(
c
,
src
,
weight
);
...
...
dnn/src/fallback/conv_bias/gi/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h
浏览文件 @
58ba080d
...
...
@@ -40,44 +40,29 @@ struct ShiftCalHelper<src_idx, weight_idx, c_dim, stride, 0, T, T2, T3> {
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
//! The x86 and RVV implementations of GiSimdFmaLane are slow; as an alternative,
//! use GiMultiplyAddScalarFloat32.
#define MLA GiMultiplyAddScalarFloat32
#define cb(step) \
c[0][step] = GiFloat32Type2FixLenType(MLA( \
GiFixLenType2GiFloat32Type(c[0][step]), \
GiFixLenType2GiFloat32Type(weight[0][weight_idx]), \
*(src[(step * stride + src_idx) / 4] + (step * stride + src_idx) % 4))); \
c[1][step] = GiFloat32Type2FixLenType(MLA( \
GiFixLenType2GiFloat32Type(c[1][step]), \
GiFixLenType2GiFloat32Type(weight[1][weight_idx]), \
*(src[(step * stride + src_idx) / 4] + (step * stride + src_idx) % 4)));
#define cb2(step) \
c[0][step] = GiFloat32Type2FixLenType(MLA( \
GiFixLenType2GiFloat32Type(c[0][step]), \
GiFixLenType2GiFloat32Type(weight[0][weight_idx]), \
*(src[(step * stride + src_idx) / 4] + (step * stride + src_idx) % 4)));
#define MLA(a, b, c, d) \
GiMultiplyAddScalarFloat32( \
GiFixLenType2GiFloat32Type(a), GiFixLenType2GiFloat32Type(b), *(c + d))
#else
#define cb(step) \
c[0][step] = GiFloat32Type2FixLenType(GiSimdFmaLane( \
GiFixLenType2GiFloat32Type(c[0][step]), \
GiFixLenType2GiFloat32Type(weight[0][weight_idx]), \
GiFixLenType2GiFloat32Type(src[(step * stride + src_idx) / 4]), \
(step * stride + src_idx) % 4)); \
c[1][step] = GiFloat32Type2FixLenType(GiSimdFmaLane( \
GiFixLenType2GiFloat32Type(c[1][step]), \
GiFixLenType2GiFloat32Type(weight[1][weight_idx]), \
GiFixLenType2GiFloat32Type(src[(step * stride + src_idx) / 4]), \
(step * stride + src_idx) % 4));
#define cb2(step) \
c[0][step] = GiFloat32Type2FixLenType(GiSimdFmaLane( \
GiFixLenType2GiFloat32Type(c[0][step]), \
GiFixLenType2GiFloat32Type(weight[0][weight_idx]), \
GiFixLenType2GiFloat32Type(src[(step * stride + src_idx) / 4]), \
(step * stride + src_idx) % 4));
#undef MLA
#define MLA(a, b, c, d) \
GiSimdFmaLane( \
GiFixLenType2GiFloat32Type(a), GiFixLenType2GiFloat32Type(b), \
GiFixLenType2GiFloat32Type(c), d)
#endif
#define cb(step) \
c[0][step] = GiFloat32Type2FixLenType( \
MLA(c[0][step], weight[0][weight_idx], src[(step * stride + src_idx) / 4], \
(step * stride + src_idx) % 4)); \
c[1][step] = GiFloat32Type2FixLenType( \
MLA(c[1][step], weight[1][weight_idx], src[(step * stride + src_idx) / 4], \
(step * stride + src_idx) % 4));
#define cb2(step) \
c[0][step] = GiFloat32Type2FixLenType( \
MLA(c[0][step], weight[0][weight_idx], src[(step * stride + src_idx) / 4], \
(step * stride + src_idx) % 4));
#define SHIFT_CAL_HELPER(ow_remain) \
template < \
int src_idx, int weight_idx, int stride, typename T, typename T2, \
...
...
@@ -108,6 +93,7 @@ SHIFT_CAL_HELPER(8)
#undef SHIFT_CAL_HELPER
#undef cb
#undef cb2
#undef MLA
template
<
int
src_idx
,
int
weight_idx
,
int
c_dim
,
int
stride
,
int
remain_w
,
typename
T
,
...
...
dnn/src/fallback/conv_bias/gi/fp32/do_conv_stride1.cpp
浏览文件 @
58ba080d
此差异已折叠。
点击以展开。
dnn/src/fallback/conv_bias/gi/fp32/do_conv_stride2.cpp
浏览文件 @
58ba080d
...
...
@@ -15,6 +15,30 @@ using namespace conv_stride2;
using
NCBKernSizeParam
=
fallback
::
ConvBiasImpl
::
NCBKernSizeParam
;
using
NCBKernParam
=
fallback
::
ConvBiasImpl
::
NCBKernParam
;
#if defined(GI_RVV_INTRINSICS)
#define PREFER_VF
#endif
#if defined(PREFER_VF)
#define MLA(a, b, c, d) GiMultiplyAddScalarFloat32(a, b, *(c + d))
namespace {
//! Fill \p ret with one SIMD register's worth of floats
//! (GI_SIMD_LEN_BYTE / sizeof(float) elements) taken from two plain float
//! arrays: first the elements of \p a from index \p n up to the register
//! width, then the first \p n elements of \p b.
//!
//! \param a   source of the leading part of the result (read from index n on)
//! \param b   source of the trailing part of the result (first n elements)
//! \param n   number of elements skipped in \p a and taken from \p b
//! \param ret destination buffer; receives one register width of floats
//!
//! NOTE(review): no range check is performed on \p n; callers presumably pass
//! 0 <= n <= register width -- confirm against call sites.
GI_FORCEINLINE void ext_float32_ptr(
        const float* a, const float* b, const int n, float* ret) {
    const int lanes = GI_SIMD_LEN_BYTE / sizeof(float);
    const int taken_from_a = lanes - n;

    // Leading part: a[n], a[n + 1], ..., a[lanes - 1] -> ret[0 ...]
    for (int src = n; src < lanes; ++src) {
        ret[src - n] = a[src];
    }

    // Trailing part: b[0], ..., b[n - 1] -> ret[taken_from_a ...]
    for (int k = 0; k < n; ++k) {
        ret[taken_from_a + k] = b[k];
    }
}
};  // namespace
#else
#define MLA(a, b, c, d) GiSimdFmaLane(a, b, c, d)
#endif
void
conv_stride2
::
do_conv_2x2_stride2
(
const
float
*
src
,
const
float
*
filter
,
float
*
dst
,
size_t
IH
,
size_t
IW
,
size_t
OH
,
size_t
OW
,
size_t
IC
)
{
...
...
@@ -29,7 +53,11 @@ void conv_stride2::do_conv_2x2_stride2(
const
float
*
k0
=
filter
;
#if defined(PREFER_VF)
const
float
*
_k0123
=
k0
;
#else
GI_FLOAT32_t
_k0123
=
GiLoadFloat32
(
k0
);
#endif
rep
(
h
,
OH
)
{
int
nn
=
OW
>>
2
;
...
...
@@ -41,16 +69,16 @@ void conv_stride2::do_conv_2x2_stride2(
GI_FLOAT32_t
_r00
=
GiGetSubVectorFloat32V2
(
_r0
,
0
);
// 0 2 4 6
GI_FLOAT32_t
_r01
=
GiGetSubVectorFloat32V2
(
_r0
,
1
);
// 1 3 5 7
_outp
=
GiSimdFmaLane
(
_outp
,
_r00
,
_k0123
,
0
);
_outp
=
GiSimdFmaLane
(
_outp
,
_r01
,
_k0123
,
1
);
_outp
=
MLA
(
_outp
,
_r00
,
_k0123
,
0
);
_outp
=
MLA
(
_outp
,
_r01
,
_k0123
,
1
);
GI_FLOAT32_V2_t
_r1
=
GiLoadUzipFloat32V2
(
r1
);
GI_FLOAT32_t
_r10
=
GiGetSubVectorFloat32V2
(
_r1
,
0
);
GI_FLOAT32_t
_r11
=
GiGetSubVectorFloat32V2
(
_r1
,
1
);
_outp
=
GiSimdFmaLane
(
_outp
,
_r10
,
_k0123
,
2
);
_outp
=
GiSimdFmaLane
(
_outp
,
_r11
,
_k0123
,
3
);
_outp
=
MLA
(
_outp
,
_r10
,
_k0123
,
2
);
_outp
=
MLA
(
_outp
,
_r11
,
_k0123
,
3
);
GiStoreFloat32
(
outptr
,
_outp
);
...
...
@@ -84,10 +112,18 @@ void conv_stride2::do_conv_3x3_stride2(
const
float
*
k1
=
filter
+
3
;
const
float
*
k2
=
filter
+
5
;
#if defined(PREFER_VF)
const
float
*
_k0123
=
k0
;
const
float
*
_k3456
=
k1
;
const
float
*
_k5678
=
k2
;
float
_k6789
[
GI_SIMD_LEN_BYTE
/
sizeof
(
float
)];
ext_float32_ptr
(
_k5678
,
_k5678
,
1
,
_k6789
);
#else
GI_FLOAT32_t
_k0123
=
GiLoadFloat32
(
k0
);
GI_FLOAT32_t
_k3456
=
GiLoadFloat32
(
k1
);
GI_FLOAT32_t
_k5678
=
GiLoadFloat32
(
k2
);
GI_FLOAT32_t
_k6789
=
GiExtqFloat32
(
_k5678
,
_k5678
,
1
);
#endif
rep
(
h
,
OH
)
{
int
nn
=
OW
>>
2
;
...
...
@@ -102,9 +138,9 @@ void conv_stride2::do_conv_3x3_stride2(
GI_FLOAT32_t
_r02
=
GiExtqFloat32
(
_r00
,
GiGetSubVectorFloat32V2
(
_r0n
,
0
),
1
);
// 2 4 6 8
_outp
=
GiSimdFmaLane
(
_outp
,
_r00
,
_k0123
,
0
);
_outp
=
GiSimdFmaLane
(
_outp
,
_r01
,
_k0123
,
1
);
_outp
=
GiSimdFmaLane
(
_outp
,
_r02
,
_k0123
,
2
);
_outp
=
MLA
(
_outp
,
_r00
,
_k0123
,
0
);
_outp
=
MLA
(
_outp
,
_r01
,
_k0123
,
1
);
_outp
=
MLA
(
_outp
,
_r02
,
_k0123
,
2
);
GI_FLOAT32_V2_t
_r1
=
GiLoadUzipFloat32V2
(
r1
);
GI_FLOAT32_V2_t
_r1n
=
GiLoadUzipFloat32V2
(
r1
+
8
);
...
...
@@ -114,9 +150,9 @@ void conv_stride2::do_conv_3x3_stride2(
GI_FLOAT32_t
_r12
=
GiExtqFloat32
(
_r10
,
GiGetSubVectorFloat32V2
(
_r1n
,
0
),
1
);
_outp
=
GiSimdFmaLane
(
_outp
,
_r10
,
_k3456
,
0
);
_outp
=
GiSimdFmaLane
(
_outp
,
_r11
,
_k3456
,
1
);
_outp
=
GiSimdFmaLane
(
_outp
,
_r12
,
_k3456
,
2
);
_outp
=
MLA
(
_outp
,
_r10
,
_k3456
,
0
);
_outp
=
MLA
(
_outp
,
_r11
,
_k3456
,
1
);
_outp
=
MLA
(
_outp
,
_r12
,
_k3456
,
2
);
GI_FLOAT32_V2_t
_r2
=
GiLoadUzipFloat32V2
(
r2
);
GI_FLOAT32_V2_t
_r2n
=
GiLoadUzipFloat32V2
(
r2
+
8
);
...
...
@@ -126,9 +162,9 @@ void conv_stride2::do_conv_3x3_stride2(
GI_FLOAT32_t
_r22
=
GiExtqFloat32
(
_r20
,
GiGetSubVectorFloat32V2
(
_r2n
,
0
),
1
);
_outp
=
GiSimdFmaLane
(
_outp
,
_r20
,
_k6789
,
0
);
_outp
=
GiSimdFmaLane
(
_outp
,
_r21
,
_k6789
,
1
);
_outp
=
GiSimdFmaLane
(
_outp
,
_r22
,
_k6789
,
2
);
_outp
=
MLA
(
_outp
,
_r20
,
_k6789
,
0
);
_outp
=
MLA
(
_outp
,
_r21
,
_k6789
,
1
);
_outp
=
MLA
(
_outp
,
_r22
,
_k6789
,
2
);
GiStoreFloat32
(
outptr
,
_outp
);
...
...
@@ -162,6 +198,15 @@ void conv_stride2::do_conv_5x5_stride2(
const
float
*
r3
=
src_ptr
+
IW
*
3
;
const
float
*
r4
=
src_ptr
+
IW
*
4
;
#if defined(PREFER_VF)
const
float
*
_k0123
=
filter
;
const
float
*
_k4567
=
filter
+
4
;
const
float
*
_k891011
=
filter
+
8
;
const
float
*
_k12131415
=
filter
+
12
;
const
float
*
_k16171819
=
filter
+
16
;
const
float
*
_k20212223
=
filter
+
20
;
const
float
*
_k24242424
=
filter
+
24
;
#else
GI_FLOAT32_t
_k0123
=
GiLoadFloat32
(
filter
);
GI_FLOAT32_t
_k4567
=
GiLoadFloat32
(
filter
+
4
);
GI_FLOAT32_t
_k891011
=
GiLoadFloat32
(
filter
+
8
);
...
...
@@ -169,6 +214,7 @@ void conv_stride2::do_conv_5x5_stride2(
GI_FLOAT32_t
_k16171819
=
GiLoadFloat32
(
filter
+
16
);
GI_FLOAT32_t
_k20212223
=
GiLoadFloat32
(
filter
+
20
);
GI_FLOAT32_t
_k24242424
=
GiBroadcastFloat32
(
filter
[
24
]);
#endif
for
(
size_t
i
=
0
;
i
<
OH
;
i
++
)
{
int
nn
=
OW
>>
2
;
...
...
@@ -230,35 +276,35 @@ void conv_stride2::do_conv_5x5_stride2(
GI_FLOAT32_t
_r43
=
GiExtqFloat32
(
_r41
,
_r4_9111315
,
1
);
GI_FLOAT32_t
_r44
=
GiExtqFloat32
(
_r40
,
_r4_8101214
,
2
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r00
,
_k0123
,
0
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r01
,
_k0123
,
1
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r02
,
_k0123
,
2
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r03
,
_k0123
,
3
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r04
,
_k4567
,
0
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r10
,
_k4567
,
1
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r11
,
_k4567
,
2
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r12
,
_k4567
,
3
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r13
,
_k891011
,
0
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r14
,
_k891011
,
1
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r20
,
_k891011
,
2
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r21
,
_k891011
,
3
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r22
,
_k12131415
,
0
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r23
,
_k12131415
,
1
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r24
,
_k12131415
,
2
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r30
,
_k12131415
,
3
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r31
,
_k16171819
,
0
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r32
,
_k16171819
,
1
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r33
,
_k16171819
,
2
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r34
,
_k16171819
,
3
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r40
,
_k20212223
,
0
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r41
,
_k20212223
,
1
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r42
,
_k20212223
,
2
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r43
,
_k20212223
,
3
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r44
,
_k24242424
,
0
);
_sum
=
MLA
(
_sum
,
_r00
,
_k0123
,
0
);
_sum
=
MLA
(
_sum
,
_r01
,
_k0123
,
1
);
_sum
=
MLA
(
_sum
,
_r02
,
_k0123
,
2
);
_sum
=
MLA
(
_sum
,
_r03
,
_k0123
,
3
);
_sum
=
MLA
(
_sum
,
_r04
,
_k4567
,
0
);
_sum
=
MLA
(
_sum
,
_r10
,
_k4567
,
1
);
_sum
=
MLA
(
_sum
,
_r11
,
_k4567
,
2
);
_sum
=
MLA
(
_sum
,
_r12
,
_k4567
,
3
);
_sum
=
MLA
(
_sum
,
_r13
,
_k891011
,
0
);
_sum
=
MLA
(
_sum
,
_r14
,
_k891011
,
1
);
_sum
=
MLA
(
_sum
,
_r20
,
_k891011
,
2
);
_sum
=
MLA
(
_sum
,
_r21
,
_k891011
,
3
);
_sum
=
MLA
(
_sum
,
_r22
,
_k12131415
,
0
);
_sum
=
MLA
(
_sum
,
_r23
,
_k12131415
,
1
);
_sum
=
MLA
(
_sum
,
_r24
,
_k12131415
,
2
);
_sum
=
MLA
(
_sum
,
_r30
,
_k12131415
,
3
);
_sum
=
MLA
(
_sum
,
_r31
,
_k16171819
,
0
);
_sum
=
MLA
(
_sum
,
_r32
,
_k16171819
,
1
);
_sum
=
MLA
(
_sum
,
_r33
,
_k16171819
,
2
);
_sum
=
MLA
(
_sum
,
_r34
,
_k16171819
,
3
);
_sum
=
MLA
(
_sum
,
_r40
,
_k20212223
,
0
);
_sum
=
MLA
(
_sum
,
_r41
,
_k20212223
,
1
);
_sum
=
MLA
(
_sum
,
_r42
,
_k20212223
,
2
);
_sum
=
MLA
(
_sum
,
_r43
,
_k20212223
,
3
);
_sum
=
MLA
(
_sum
,
_r44
,
_k24242424
,
0
);
GiStoreFloat32
(
outptr
,
_sum
);
...
...
@@ -312,8 +358,13 @@ void conv_stride2::do_conv_7x7_stride2(
rep
(
i
,
nn
)
{
GI_FLOAT32_t
_sum
=
GiLoadFloat32
(
outptr
);
#if defined(PREFER_VF)
const
float
*
_k0123
=
k0
;
const
float
*
_k4567
=
k0
+
4
;
#else
GI_FLOAT32_t
_k0123
=
GiLoadFloat32
(
k0
);
GI_FLOAT32_t
_k4567
=
GiLoadFloat32
(
k0
+
4
);
#endif
GI_FLOAT32_V2_t
_r00_02461357
=
GiLoadUzipFloat32V2
(
r0
);
GI_FLOAT32_V2_t
_r00nx2
=
GiLoadUzipFloat32V2
(
r0
+
8
);
...
...
@@ -331,16 +382,21 @@ void conv_stride2::do_conv_7x7_stride2(
GI_FLOAT32_t
_r05
=
GiExtqFloat32
(
_r01
,
_r0_9111315
,
2
);
// 5 7 9 11
GI_FLOAT32_t
_r06
=
GiExtqFloat32
(
_r00
,
_r0_8101214
,
3
);
// 6 8 10 12
_sum
=
GiSimdFmaLane
(
_sum
,
_r00
,
_k0123
,
0
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r01
,
_k0123
,
1
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r02
,
_k0123
,
2
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r03
,
_k0123
,
3
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r04
,
_k4567
,
0
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r05
,
_k4567
,
1
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r06
,
_k4567
,
2
);
_sum
=
MLA
(
_sum
,
_r00
,
_k0123
,
0
);
_sum
=
MLA
(
_sum
,
_r01
,
_k0123
,
1
);
_sum
=
MLA
(
_sum
,
_r02
,
_k0123
,
2
);
_sum
=
MLA
(
_sum
,
_r03
,
_k0123
,
3
);
_sum
=
MLA
(
_sum
,
_r04
,
_k4567
,
0
);
_sum
=
MLA
(
_sum
,
_r05
,
_k4567
,
1
);
_sum
=
MLA
(
_sum
,
_r06
,
_k4567
,
2
);
#if defined(PREFER_VF)
const
float
*
_k78910
=
k1
;
const
float
*
_k11121314
=
k1
+
4
;
#else
GI_FLOAT32_t
_k78910
=
GiLoadFloat32
(
k1
);
GI_FLOAT32_t
_k11121314
=
GiLoadFloat32
(
k1
+
4
);
#endif
GI_FLOAT32_V2_t
_r10_02461357
=
GiLoadUzipFloat32V2
(
r1
);
GI_FLOAT32_V2_t
_r10nx2
=
GiLoadUzipFloat32V2
(
r1
+
8
);
...
...
@@ -354,16 +410,21 @@ void conv_stride2::do_conv_7x7_stride2(
GI_FLOAT32_t
_r15
=
GiExtqFloat32
(
_r11
,
_r1_9111315
,
2
);
GI_FLOAT32_t
_r16
=
GiExtqFloat32
(
_r10
,
_r1_8101214
,
3
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r10
,
_k78910
,
0
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r11
,
_k78910
,
1
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r12
,
_k78910
,
2
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r13
,
_k78910
,
3
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r14
,
_k11121314
,
0
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r15
,
_k11121314
,
1
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r16
,
_k11121314
,
2
);
_sum
=
MLA
(
_sum
,
_r10
,
_k78910
,
0
);
_sum
=
MLA
(
_sum
,
_r11
,
_k78910
,
1
);
_sum
=
MLA
(
_sum
,
_r12
,
_k78910
,
2
);
_sum
=
MLA
(
_sum
,
_r13
,
_k78910
,
3
);
_sum
=
MLA
(
_sum
,
_r14
,
_k11121314
,
0
);
_sum
=
MLA
(
_sum
,
_r15
,
_k11121314
,
1
);
_sum
=
MLA
(
_sum
,
_r16
,
_k11121314
,
2
);
#if defined(PREFER_VF)
const
float
*
_k14151617
=
k2
;
const
float
*
_k18192021
=
k2
+
4
;
#else
GI_FLOAT32_t
_k14151617
=
GiLoadFloat32
(
k2
);
GI_FLOAT32_t
_k18192021
=
GiLoadFloat32
(
k2
+
4
);
#endif
GI_FLOAT32_V2_t
_r20_02461357
=
GiLoadUzipFloat32V2
(
r2
);
GI_FLOAT32_V2_t
_r20nx2
=
GiLoadUzipFloat32V2
(
r2
+
8
);
...
...
@@ -377,16 +438,21 @@ void conv_stride2::do_conv_7x7_stride2(
GI_FLOAT32_t
_r25
=
GiExtqFloat32
(
_r21
,
_r2_9111315
,
2
);
GI_FLOAT32_t
_r26
=
GiExtqFloat32
(
_r20
,
_r2_8101214
,
3
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r20
,
_k14151617
,
0
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r21
,
_k14151617
,
1
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r22
,
_k14151617
,
2
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r23
,
_k14151617
,
3
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r24
,
_k18192021
,
0
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r25
,
_k18192021
,
1
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r26
,
_k18192021
,
2
);
_sum
=
MLA
(
_sum
,
_r20
,
_k14151617
,
0
);
_sum
=
MLA
(
_sum
,
_r21
,
_k14151617
,
1
);
_sum
=
MLA
(
_sum
,
_r22
,
_k14151617
,
2
);
_sum
=
MLA
(
_sum
,
_r23
,
_k14151617
,
3
);
_sum
=
MLA
(
_sum
,
_r24
,
_k18192021
,
0
);
_sum
=
MLA
(
_sum
,
_r25
,
_k18192021
,
1
);
_sum
=
MLA
(
_sum
,
_r26
,
_k18192021
,
2
);
#if defined(PREFER_VF)
const
float
*
_k21222324
=
k3
;
const
float
*
_k25262728
=
k3
+
4
;
#else
GI_FLOAT32_t
_k21222324
=
GiLoadFloat32
(
k3
);
GI_FLOAT32_t
_k25262728
=
GiLoadFloat32
(
k3
+
4
);
#endif
GI_FLOAT32_V2_t
_r30_02461357
=
GiLoadUzipFloat32V2
(
r3
);
GI_FLOAT32_V2_t
_r30nx2
=
GiLoadUzipFloat32V2
(
r3
+
8
);
...
...
@@ -400,16 +466,21 @@ void conv_stride2::do_conv_7x7_stride2(
GI_FLOAT32_t
_r35
=
GiExtqFloat32
(
_r31
,
_r3_9111315
,
2
);
GI_FLOAT32_t
_r36
=
GiExtqFloat32
(
_r30
,
_r3_8101214
,
3
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r30
,
_k21222324
,
0
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r31
,
_k21222324
,
1
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r32
,
_k21222324
,
2
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r33
,
_k21222324
,
3
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r34
,
_k25262728
,
0
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r35
,
_k25262728
,
1
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r36
,
_k25262728
,
2
);
_sum
=
MLA
(
_sum
,
_r30
,
_k21222324
,
0
);
_sum
=
MLA
(
_sum
,
_r31
,
_k21222324
,
1
);
_sum
=
MLA
(
_sum
,
_r32
,
_k21222324
,
2
);
_sum
=
MLA
(
_sum
,
_r33
,
_k21222324
,
3
);
_sum
=
MLA
(
_sum
,
_r34
,
_k25262728
,
0
);
_sum
=
MLA
(
_sum
,
_r35
,
_k25262728
,
1
);
_sum
=
MLA
(
_sum
,
_r36
,
_k25262728
,
2
);
#if defined(PREFER_VF)
const
float
*
_k28293031
=
k4
;
const
float
*
_k32333435
=
k4
+
4
;
#else
GI_FLOAT32_t
_k28293031
=
GiLoadFloat32
(
k4
);
GI_FLOAT32_t
_k32333435
=
GiLoadFloat32
(
k4
+
4
);
#endif
GI_FLOAT32_V2_t
_r40_02461357
=
GiLoadUzipFloat32V2
(
r4
);
GI_FLOAT32_V2_t
_r40nx2
=
GiLoadUzipFloat32V2
(
r4
+
8
);
...
...
@@ -423,16 +494,21 @@ void conv_stride2::do_conv_7x7_stride2(
GI_FLOAT32_t
_r45
=
GiExtqFloat32
(
_r41
,
_r4_9111315
,
2
);
GI_FLOAT32_t
_r46
=
GiExtqFloat32
(
_r40
,
_r4_8101214
,
3
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r40
,
_k28293031
,
0
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r41
,
_k28293031
,
1
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r42
,
_k28293031
,
2
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r43
,
_k28293031
,
3
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r44
,
_k32333435
,
0
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r45
,
_k32333435
,
1
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r46
,
_k32333435
,
2
);
_sum
=
MLA
(
_sum
,
_r40
,
_k28293031
,
0
);
_sum
=
MLA
(
_sum
,
_r41
,
_k28293031
,
1
);
_sum
=
MLA
(
_sum
,
_r42
,
_k28293031
,
2
);
_sum
=
MLA
(
_sum
,
_r43
,
_k28293031
,
3
);
_sum
=
MLA
(
_sum
,
_r44
,
_k32333435
,
0
);
_sum
=
MLA
(
_sum
,
_r45
,
_k32333435
,
1
);
_sum
=
MLA
(
_sum
,
_r46
,
_k32333435
,
2
);
#if defined(PREFER_VF)
const
float
*
_k35363738
=
k5
;
const
float
*
_k39404142
=
k5
+
4
;
#else
GI_FLOAT32_t
_k35363738
=
GiLoadFloat32
(
k5
);
GI_FLOAT32_t
_k39404142
=
GiLoadFloat32
(
k5
+
4
);
#endif
GI_FLOAT32_V2_t
_r50_02461357
=
GiLoadUzipFloat32V2
(
r5
);
GI_FLOAT32_V2_t
_r50nx2
=
GiLoadUzipFloat32V2
(
r5
+
8
);
...
...
@@ -446,16 +522,21 @@ void conv_stride2::do_conv_7x7_stride2(
GI_FLOAT32_t
_r55
=
GiExtqFloat32
(
_r51
,
_r5_9111315
,
2
);
GI_FLOAT32_t
_r56
=
GiExtqFloat32
(
_r50
,
_r5_8101214
,
3
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r50
,
_k35363738
,
0
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r51
,
_k35363738
,
1
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r52
,
_k35363738
,
2
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r53
,
_k35363738
,
3
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r54
,
_k39404142
,
0
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r55
,
_k39404142
,
1
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r56
,
_k39404142
,
2
);
_sum
=
MLA
(
_sum
,
_r50
,
_k35363738
,
0
);
_sum
=
MLA
(
_sum
,
_r51
,
_k35363738
,
1
);
_sum
=
MLA
(
_sum
,
_r52
,
_k35363738
,
2
);
_sum
=
MLA
(
_sum
,
_r53
,
_k35363738
,
3
);
_sum
=
MLA
(
_sum
,
_r54
,
_k39404142
,
0
);
_sum
=
MLA
(
_sum
,
_r55
,
_k39404142
,
1
);
_sum
=
MLA
(
_sum
,
_r56
,
_k39404142
,
2
);
#if defined(PREFER_VF)
const
float
*
_k42434445
=
k6
;
const
float
*
_k45464748
=
k6
+
3
;
#else
GI_FLOAT32_t
_k42434445
=
GiLoadFloat32
(
k6
);
GI_FLOAT32_t
_k45464748
=
GiLoadFloat32
(
k6
+
3
);
#endif
GI_FLOAT32_V2_t
_r60_02461357
=
GiLoadUzipFloat32V2
(
r6
);
GI_FLOAT32_V2_t
_r60nx2
=
GiLoadUzipFloat32V2
(
r6
+
8
);
...
...
@@ -469,13 +550,13 @@ void conv_stride2::do_conv_7x7_stride2(
GI_FLOAT32_t
_r65
=
GiExtqFloat32
(
_r61
,
_r6_9111315
,
2
);
GI_FLOAT32_t
_r66
=
GiExtqFloat32
(
_r60
,
_r6_8101214
,
3
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r60
,
_k42434445
,
0
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r61
,
_k42434445
,
1
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r62
,
_k42434445
,
2
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r63
,
_k42434445
,
3
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r64
,
_k45464748
,
1
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r65
,
_k45464748
,
2
);
_sum
=
GiSimdFmaLane
(
_sum
,
_r66
,
_k45464748
,
3
);
_sum
=
MLA
(
_sum
,
_r60
,
_k42434445
,
0
);
_sum
=
MLA
(
_sum
,
_r61
,
_k42434445
,
1
);
_sum
=
MLA
(
_sum
,
_r62
,
_k42434445
,
2
);
_sum
=
MLA
(
_sum
,
_r63
,
_k42434445
,
3
);
_sum
=
MLA
(
_sum
,
_r64
,
_k45464748
,
1
);
_sum
=
MLA
(
_sum
,
_r65
,
_k45464748
,
2
);
_sum
=
MLA
(
_sum
,
_r66
,
_k45464748
,
3
);
GiStoreFloat32
(
outptr
,
_sum
);
...
...
dnn/src/fallback/conv_bias/gi/fp32/strategy_f73_mk4_nchw44.cpp
浏览文件 @
58ba080d
...
...
@@ -75,6 +75,21 @@ struct InputTransformF73_NCHW44 {
size_t
icb
=
ic
/
pack_size
;
GI_FLOAT32_t
d0
,
d1
,
d2
,
d3
,
d4
,
d5
,
d6
,
d7
,
d8
;
#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS)
//! On x86 and RVV the GiSimdFmaLane API is slow; as an alternative, use
//! GiMultiplyAddScalarFloat32
#define MADD(a, b, c, d) GiMultiplyAddScalarFloat32(a, b, *(c + d))
const
float
*
v0
=
input_parameters
+
0
;
const
float
*
v1
=
input_parameters
+
4
;
const
float
*
v2
=
input_parameters
+
8
;
const
float
*
v3
=
input_parameters
+
12
;
const
float
*
v4
=
input_parameters
+
16
;
const
float
*
v5
=
input_parameters
+
20
;
const
float
*
v6
=
input_parameters
+
24
;
#define MSUB(a, b, c, d) GiMultiplySubScalarFloat32(a, b, *(c + d))
#else
#define MADD(a, b, c, d) GiSimdFmaLane(a, b, c, d)
#define MSUB(a, b, c, d) GiFmsqLaneQFloat32(a, b, c, d)
GI_FLOAT32_t
v0
=
GiLoadFloat32
(
input_parameters
+
0
);
GI_FLOAT32_t
v1
=
GiLoadFloat32
(
input_parameters
+
4
);
GI_FLOAT32_t
v2
=
GiLoadFloat32
(
input_parameters
+
8
);
...
...
@@ -82,6 +97,7 @@ struct InputTransformF73_NCHW44 {
GI_FLOAT32_t
v4
=
GiLoadFloat32
(
input_parameters
+
16
);
GI_FLOAT32_t
v5
=
GiLoadFloat32
(
input_parameters
+
20
);
GI_FLOAT32_t
v6
=
GiLoadFloat32
(
input_parameters
+
24
);
#endif
//! B
//! 1.5 0 0 0 0 0 0 0 0
...
...
@@ -120,59 +136,59 @@ struct InputTransformF73_NCHW44 {
auto t##i##5 = d7; \
auto t##i##6 = d7; \
auto t##i##7 = d7; \
t##i##8 =
GiFmsqLaneQFloat32(t##i##8, d7, v0, 0);
\
t##i##8 =
MSUB(t##i##8, d7, v0, 0);
\
t##i##0 = GiSubtractFloat32(t##i##0, d1); \
t##i##1 =
GiFmsqLaneQFloat32(t##i##1, d1, v0, 0);
\
t##i##2 =
GiSimdFmaLane(t##i##2, d1, v0, 0);
\
t##i##3 =
GiFmsqLaneQFloat32(t##i##3, d1, v0, 1);
\
t##i##4 =
GiSimdFmaLane(t##i##4, d1, v0, 1);
\
t##i##5 =
GiFmsqLaneQFloat32(t##i##5, d1, v0, 2);
\
t##i##6 =
GiSimdFmaLane(t##i##6, d1, v0, 2);
\
t##i##1 =
MSUB(t##i##1, d1, v0, 0);
\
t##i##2 =
MADD(t##i##2, d1, v0, 0);
\
t##i##3 =
MSUB(t##i##3, d1, v0, 1);
\
t##i##4 =
MADD(t##i##4, d1, v0, 1);
\
t##i##5 =
MSUB(t##i##5, d1, v0, 2);
\
t##i##6 =
MADD(t##i##6, d1, v0, 2);
\
t##i##7 = GiSubtractFloat32(t##i##7, d1); \
t##i##8 =
GiSimdFmaLane(t##i##8, d1, v0, 0);
\
t##i##0 =
GiFmsqLaneQFloat32(t##i##0, d2, v0, 3);
\
t##i##1 =
GiFmsqLaneQFloat32(t##i##1, d2, v1, 0);
\
t##i##2 =
GiFmsqLaneQFloat32(t##i##2, d2, v1, 1);
\
t##i##3 =
GiSimdFmaLane(t##i##3, d2, v1, 2);
\
t##i##4 =
GiFmsqLaneQFloat32(t##i##4, d2, v1, 3);
\
t##i##5 =
GiFmsqLaneQFloat32(t##i##5, d2, v2, 0);
\
t##i##6 =
GiFmsqLaneQFloat32(t##i##6, d2, v2, 1);
\
t##i##8 =
MADD(t##i##8, d1, v0, 0);
\
t##i##0 =
MSUB(t##i##0, d2, v0, 3);
\
t##i##1 =
MSUB(t##i##1, d2, v1, 0);
\
t##i##2 =
MSUB(t##i##2, d2, v1, 1);
\
t##i##3 =
MADD(t##i##3, d2, v1, 2);
\
t##i##4 =
MSUB(t##i##4, d2, v1, 3);
\
t##i##5 =
MSUB(t##i##5, d2, v2, 0);
\
t##i##6 =
MSUB(t##i##6, d2, v2, 1);
\
t##i##8 = GiSubtractFloat32(t##i##8, d2); \
t##i##0 =
GiSimdFmaLane(t##i##0, d3, v2, 2);
\
t##i##1 =
GiSimdFmaLane(t##i##1, d3, v2, 3);
\
t##i##2 =
GiFmsqLaneQFloat32(t##i##2, d3, v3, 0);
\
t##i##3 =
GiSimdFmaLane(t##i##3, d3, v2, 0);
\
t##i##4 =
GiFmsqLaneQFloat32(t##i##4, d3, v3, 1);
\
t##i##5 =
GiSimdFmaLane(t##i##5, d3, v3, 2);
\
t##i##6 =
GiSimdFmaLane(t##i##6, d3, v3, 3);
\
t##i##7 =
GiSimdFmaLane(t##i##7, d3, v2, 2);
\
t##i##8 =
GiFmsqLaneQFloat32(t##i##8, d3, v0, 3);
\
t##i##0 =
GiSimdFmaLane(t##i##0, d4, v0, 3);
\
t##i##1 =
GiSimdFmaLane(t##i##1, d4, v4, 0);
\
t##i##2 =
GiSimdFmaLane(t##i##2, d4, v4, 1);
\
t##i##3 =
GiFmsqLaneQFloat32(t##i##3, d4, v4, 2);
\
t##i##4 =
GiSimdFmaLane(t##i##4, d4, v4, 3);
\
t##i##5 =
GiSimdFmaLane(t##i##5, d4, v5, 0);
\
t##i##6 =
GiSimdFmaLane(t##i##6, d4, v5, 1);
\
t##i##8 =
GiSimdFmaLane(t##i##8, d4, v2, 2);
\
t##i##0 =
GiFmsqLaneQFloat32(t##i##0, d5, v2, 2);
\
t##i##1 =
GiFmsqLaneQFloat32(t##i##1, d5, v5, 2);
\
t##i##2 =
GiFmsqLaneQFloat32(t##i##2, d5, v5, 3);
\
t##i##3 =
GiFmsqLaneQFloat32(t##i##3, d5, v6, 0);
\
t##i##4 =
GiSimdFmaLane(t##i##4, d5, v6, 1);
\
t##i##5 =
GiFmsqLaneQFloat32(t##i##5, d5, v5, 2);
\
t##i##6 =
GiFmsqLaneQFloat32(t##i##6, d5, v6, 0);
\
t##i##7 =
GiFmsqLaneQFloat32(t##i##7, d5, v2, 2);
\
t##i##8 =
GiSimdFmaLane(t##i##8, d5, v0, 3);
\
t##i##0 =
GiFmsqLaneQFloat32(t##i##0, d6, v0, 0);
\
t##i##1 =
GiFmsqLaneQFloat32(t##i##1, d6, v1, 0);
\
t##i##2 =
GiFmsqLaneQFloat32(t##i##2, d6, v1, 1);
\
t##i##3 =
GiSimdFmaLane(t##i##3, d6, v1, 0);
\
t##i##4 =
GiFmsqLaneQFloat32(t##i##4, d6, v3, 1);
\
t##i##0 =
MADD(t##i##0, d3, v2, 2);
\
t##i##1 =
MADD(t##i##1, d3, v2, 3);
\
t##i##2 =
MSUB(t##i##2, d3, v3, 0);
\
t##i##3 =
MADD(t##i##3, d3, v2, 0);
\
t##i##4 =
MSUB(t##i##4, d3, v3, 1);
\
t##i##5 =
MADD(t##i##5, d3, v3, 2);
\
t##i##6 =
MADD(t##i##6, d3, v3, 3);
\
t##i##7 =
MADD(t##i##7, d3, v2, 2);
\
t##i##8 =
MSUB(t##i##8, d3, v0, 3);
\
t##i##0 =
MADD(t##i##0, d4, v0, 3);
\
t##i##1 =
MADD(t##i##1, d4, v4, 0);
\
t##i##2 =
MADD(t##i##2, d4, v4, 1);
\
t##i##3 =
MSUB(t##i##3, d4, v4, 2);
\
t##i##4 =
MADD(t##i##4, d4, v4, 3);
\
t##i##5 =
MADD(t##i##5, d4, v5, 0);
\
t##i##6 =
MADD(t##i##6, d4, v5, 1);
\
t##i##8 =
MADD(t##i##8, d4, v2, 2);
\
t##i##0 =
MSUB(t##i##0, d5, v2, 2);
\
t##i##1 =
MSUB(t##i##1, d5, v5, 2);
\
t##i##2 =
MSUB(t##i##2, d5, v5, 3);
\
t##i##3 =
MSUB(t##i##3, d5, v6, 0);
\
t##i##4 =
MADD(t##i##4, d5, v6, 1);
\
t##i##5 =
MSUB(t##i##5, d5, v5, 2);
\
t##i##6 =
MSUB(t##i##6, d5, v6, 0);
\
t##i##7 =
MSUB(t##i##7, d5, v2, 2);
\
t##i##8 =
MADD(t##i##8, d5, v0, 3);
\
t##i##0 =
MSUB(t##i##0, d6, v0, 0);
\
t##i##1 =
MSUB(t##i##1, d6, v1, 0);
\
t##i##2 =
MSUB(t##i##2, d6, v1, 1);
\
t##i##3 =
MADD(t##i##3, d6, v1, 0);
\
t##i##4 =
MSUB(t##i##4, d6, v3, 1);
\
t##i##5 = GiSubtractFloat32(t##i##5, d6); \
t##i##6 =
GiFmsqLaneQFloat32(t##i##6, d6, v6, 2);
\
t##i##8 =
GiFmsqLaneQFloat32(t##i##8, d6, v2, 2);
\
t##i##0 =
GiSimdFmaLane
(t##i##0, d0, v0, 0);
t##i##6 =
MSUB(t##i##6, d6, v6, 2);
\
t##i##8 =
MSUB(t##i##8, d6, v2, 2);
\
t##i##0 =
MADD
(t##i##0, d0, v0, 0);
UNROLL_CALL_RAW
(
9
,
cb
);
#undef cb
...
...
@@ -187,59 +203,59 @@ struct InputTransformF73_NCHW44 {
d5 = t7##i; \
d6 = t7##i; \
d7 = t7##i; \
d8 =
GiFmsqLaneQFloat32(d8, t7##i, v0, 0);
\
d8 =
MSUB(d8, t7##i, v0, 0);
\
d0 = GiSubtractFloat32(d0, t1##i); \
d1 =
GiFmsqLaneQFloat32(d1, t1##i, v0, 0);
\
d2 =
GiSimdFmaLane(d2, t1##i, v0, 0);
\
d3 =
GiFmsqLaneQFloat32(d3, t1##i, v0, 1);
\
d4 =
GiSimdFmaLane(d4, t1##i, v0, 1);
\
d5 =
GiFmsqLaneQFloat32(d5, t1##i, v0, 2);
\
d6 =
GiSimdFmaLane(d6, t1##i, v0, 2);
\
d1 =
MSUB(d1, t1##i, v0, 0);
\
d2 =
MADD(d2, t1##i, v0, 0);
\
d3 =
MSUB(d3, t1##i, v0, 1);
\
d4 =
MADD(d4, t1##i, v0, 1);
\
d5 =
MSUB(d5, t1##i, v0, 2);
\
d6 =
MADD(d6, t1##i, v0, 2);
\
d7 = GiSubtractFloat32(d7, t1##i); \
d8 =
GiSimdFmaLane(d8, t1##i, v0, 0);
\
d0 =
GiFmsqLaneQFloat32(d0, t2##i, v0, 3);
\
d1 =
GiFmsqLaneQFloat32(d1, t2##i, v1, 0);
\
d2 =
GiFmsqLaneQFloat32(d2, t2##i, v1, 1);
\
d3 =
GiSimdFmaLane(d3, t2##i, v1, 2);
\
d4 =
GiFmsqLaneQFloat32(d4, t2##i, v1, 3);
\
d5 =
GiFmsqLaneQFloat32(d5, t2##i, v2, 0);
\
d6 =
GiFmsqLaneQFloat32(d6, t2##i, v2, 1);
\
d8 =
MADD(d8, t1##i, v0, 0);
\
d0 =
MSUB(d0, t2##i, v0, 3);
\
d1 =
MSUB(d1, t2##i, v1, 0);
\
d2 =
MSUB(d2, t2##i, v1, 1);
\
d3 =
MADD(d3, t2##i, v1, 2);
\
d4 =
MSUB(d4, t2##i, v1, 3);
\
d5 =
MSUB(d5, t2##i, v2, 0);
\
d6 =
MSUB(d6, t2##i, v2, 1);
\
d8 = GiSubtractFloat32(d8, t2##i); \
d0 =
GiSimdFmaLane(d0, t3##i, v2, 2);
\
d1 =
GiSimdFmaLane(d1, t3##i, v2, 3);
\
d2 =
GiFmsqLaneQFloat32(d2, t3##i, v3, 0);
\
d3 =
GiSimdFmaLane(d3, t3##i, v2, 0);
\
d4 =
GiFmsqLaneQFloat32(d4, t3##i, v3, 1);
\
d5 =
GiSimdFmaLane(d5, t3##i, v3, 2);
\
d6 =
GiSimdFmaLane(d6, t3##i, v3, 3);
\
d7 =
GiSimdFmaLane(d7, t3##i, v2, 2);
\
d8 =
GiFmsqLaneQFloat32(d8, t3##i, v0, 3);
\
d0 =
GiSimdFmaLane(d0, t4##i, v0, 3);
\
d1 =
GiSimdFmaLane(d1, t4##i, v4, 0);
\
d2 =
GiSimdFmaLane(d2, t4##i, v4, 1);
\
d3 =
GiFmsqLaneQFloat32(d3, t4##i, v4, 2);
\
d4 =
GiSimdFmaLane(d4, t4##i, v4, 3);
\
d5 =
GiSimdFmaLane(d5, t4##i, v5, 0);
\
d6 =
GiSimdFmaLane(d6, t4##i, v5, 1);
\
d8 =
GiSimdFmaLane(d8, t4##i, v2, 2);
\
d0 =
GiFmsqLaneQFloat32(d0, t5##i, v2, 2);
\
d1 =
GiFmsqLaneQFloat32(d1, t5##i, v5, 2);
\
d2 =
GiFmsqLaneQFloat32(d2, t5##i, v5, 3);
\
d3 =
GiFmsqLaneQFloat32(d3, t5##i, v6, 0);
\
d4 =
GiSimdFmaLane(d4, t5##i, v6, 1);
\
d5 =
GiFmsqLaneQFloat32(d5, t5##i, v5, 2);
\
d6 =
GiFmsqLaneQFloat32(d6, t5##i, v6, 0);
\
d7 =
GiFmsqLaneQFloat32(d7, t5##i, v2, 2);
\
d8 =
GiSimdFmaLane(d8, t5##i, v0, 3);
\
d0 =
GiFmsqLaneQFloat32(d0, t6##i, v0, 0);
\
d1 =
GiFmsqLaneQFloat32(d1, t6##i, v1, 0);
\
d2 =
GiFmsqLaneQFloat32(d2, t6##i, v1, 1);
\
d3 =
GiSimdFmaLane(d3, t6##i, v1, 0);
\
d4 =
GiFmsqLaneQFloat32(d4, t6##i, v3, 1);
\
d0 =
MADD(d0, t3##i, v2, 2);
\
d1 =
MADD(d1, t3##i, v2, 3);
\
d2 =
MSUB(d2, t3##i, v3, 0);
\
d3 =
MADD(d3, t3##i, v2, 0);
\
d4 =
MSUB(d4, t3##i, v3, 1);
\
d5 =
MADD(d5, t3##i, v3, 2);
\
d6 =
MADD(d6, t3##i, v3, 3);
\
d7 =
MADD(d7, t3##i, v2, 2);
\
d8 =
MSUB(d8, t3##i, v0, 3);
\
d0 =
MADD(d0, t4##i, v0, 3);
\
d1 =
MADD(d1, t4##i, v4, 0);
\
d2 =
MADD(d2, t4##i, v4, 1);
\
d3 =
MSUB(d3, t4##i, v4, 2);
\
d4 =
MADD(d4, t4##i, v4, 3);
\
d5 =
MADD(d5, t4##i, v5, 0);
\
d6 =
MADD(d6, t4##i, v5, 1);
\
d8 =
MADD(d8, t4##i, v2, 2);
\
d0 =
MSUB(d0, t5##i, v2, 2);
\
d1 =
MSUB(d1, t5##i, v5, 2);
\
d2 =
MSUB(d2, t5##i, v5, 3);
\
d3 =
MSUB(d3, t5##i, v6, 0);
\
d4 =
MADD(d4, t5##i, v6, 1);
\
d5 =
MSUB(d5, t5##i, v5, 2);
\
d6 =
MSUB(d6, t5##i, v6, 0);
\
d7 =
MSUB(d7, t5##i, v2, 2);
\
d8 =
MADD(d8, t5##i, v0, 3);
\
d0 =
MSUB(d0, t6##i, v0, 0);
\
d1 =
MSUB(d1, t6##i, v1, 0);
\
d2 =
MSUB(d2, t6##i, v1, 1);
\
d3 =
MADD(d3, t6##i, v1, 0);
\
d4 =
MSUB(d4, t6##i, v3, 1);
\
d5 = GiSubtractFloat32(d5, t6##i); \
d6 =
GiFmsqLaneQFloat32(d6, t6##i, v6, 2);
\
d8 =
GiFmsqLaneQFloat32(d8, t6##i, v2, 2);
\
d0 =
GiSimdFmaLane(d0, t0##i, v0, 0);
\
d6 =
MSUB(d6, t6##i, v6, 2);
\
d8 =
MSUB(d8, t6##i, v2, 2);
\
d0 =
MADD(d0, t0##i, v0, 0);
\
GiStoreFloat32( \
input_transform_buf + \
(0 * alpha + i) * ICB * nr_units_in_tile * pack_size + \
...
...
@@ -288,6 +304,8 @@ struct InputTransformF73_NCHW44 {
UNROLL_CALL_RAW
(
9
,
cb
);
#undef cb
#undef MADD
#undef MSUB
}
};
...
...
dnn/src/fallback/general_intrinsic/gi_float.h
浏览文件 @
58ba080d
...
...
@@ -224,9 +224,7 @@ GI_FLOAT32_t GiMlaqFloat32(GI_FLOAT32_t a, GI_FLOAT32_t b, GI_FLOAT32_t c) {
#endif
#elif defined(GI_SSE2_INTRINSICS)
// fma is coming soon, but right now:
__m128
res
;
res
=
_mm_mul_ps
(
c
,
b
);
return
_mm_add_ps
(
a
,
res
);
return
_mm_add_ps
(
a
,
_mm_mul_ps
(
c
,
b
));
#elif defined(GI_RVV_INTRINSICS)
return
vfmadd_vv_f32m1
(
b
,
c
,
a
,
GI_SIMD_LEN_BYTE
/
sizeof
(
float
));
#else
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录