Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
wjd2002
Ncnn
提交
5ac17df7
N
Ncnn
项目概览
wjd2002
/
Ncnn
10 个月 前同步成功
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
N
Ncnn
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
未验证
提交
5ac17df7
编写于
3月 26, 2023
作者:
N
nihui
提交者:
GitHub
3月 26, 2023
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
arm optimization for packed convolution unified elempack (#4590)
上级
8049623d
变更
22
展开全部
隐藏空白更改
内联
并排
Showing
22 changed file
with
4668 addition
and
2261 deletion
+4668
-2261
src/layer/arm/convolution_arm.cpp
src/layer/arm/convolution_arm.cpp
+110
-125
src/layer/arm/convolution_arm_asimdhp.cpp
src/layer/arm/convolution_arm_asimdhp.cpp
+28
-70
src/layer/arm/convolution_bf16s.h
src/layer/arm/convolution_bf16s.h
+0
-90
src/layer/arm/convolution_fp16s.h
src/layer/arm/convolution_fp16s.h
+0
-90
src/layer/arm/convolution_pack1to4.h
src/layer/arm/convolution_pack1to4.h
+0
-129
src/layer/arm/convolution_pack1to4_bf16s.h
src/layer/arm/convolution_pack1to4_bf16s.h
+0
-129
src/layer/arm/convolution_pack1to4_fp16s.h
src/layer/arm/convolution_pack1to4_fp16s.h
+0
-167
src/layer/arm/convolution_pack1to8_fp16s.h
src/layer/arm/convolution_pack1to8_fp16s.h
+0
-90
src/layer/arm/convolution_pack4.h
src/layer/arm/convolution_pack4.h
+0
-173
src/layer/arm/convolution_pack4_bf16s.h
src/layer/arm/convolution_pack4_bf16s.h
+0
-174
src/layer/arm/convolution_pack4_fp16s.h
src/layer/arm/convolution_pack4_fp16s.h
+0
-183
src/layer/arm/convolution_pack4to1.h
src/layer/arm/convolution_pack4to1.h
+0
-132
src/layer/arm/convolution_pack4to1_bf16s.h
src/layer/arm/convolution_pack4to1_bf16s.h
+0
-132
src/layer/arm/convolution_pack4to1_fp16s.h
src/layer/arm/convolution_pack4to1_fp16s.h
+0
-171
src/layer/arm/convolution_pack4to8_fp16s.h
src/layer/arm/convolution_pack4to8_fp16s.h
+0
-98
src/layer/arm/convolution_pack8_fp16s.h
src/layer/arm/convolution_pack8_fp16s.h
+0
-106
src/layer/arm/convolution_pack8to1_fp16s.h
src/layer/arm/convolution_pack8to1_fp16s.h
+0
-93
src/layer/arm/convolution_pack8to4_fp16s.h
src/layer/arm/convolution_pack8to4_fp16s.h
+0
-106
src/layer/arm/convolution_packed.h
src/layer/arm/convolution_packed.h
+1297
-0
src/layer/arm/convolution_packed_bf16s.h
src/layer/arm/convolution_packed_bf16s.h
+1336
-0
src/layer/arm/convolution_packed_fp16s.h
src/layer/arm/convolution_packed_fp16s.h
+1894
-0
src/layer/x86/convolution_packed.h
src/layer/x86/convolution_packed.h
+3
-3
未找到文件。
src/layer/arm/convolution_arm.cpp
浏览文件 @
5ac17df7
...
...
@@ -34,11 +34,12 @@ namespace ncnn {
#include "convolution_5x5.h"
#include "convolution_7x7.h"
#include "convolution_packed.h"
#include "convolution_3x3_winograd.h"
#include "convolution_im2col_gemm.h"
#if NCNN_BF16
#include "convolution_bf16s.h"
#include "convolution_
packed_
bf16s.h"
#include "convolution_3x3_winograd_bf16s.h"
...
...
@@ -56,10 +57,6 @@ namespace ncnn {
#endif // NCNN_INT8
#if __ARM_NEON
#include "convolution_pack4.h"
#include "convolution_pack1to4.h"
#include "convolution_pack4to1.h"
#include "convolution_3x3_pack1to4.h"
#include "convolution_3x3_pack4.h"
#include "convolution_3x3_pack4to1.h"
...
...
@@ -67,10 +64,6 @@ namespace ncnn {
#include "convolution_7x7_pack1to4.h"
#if NCNN_BF16
#include "convolution_pack4_bf16s.h"
#include "convolution_pack1to4_bf16s.h"
#include "convolution_pack4to1_bf16s.h"
#include "convolution_3x3_pack1to4_bf16s.h"
#include "convolution_3x3_pack4_bf16s.h"
#include "convolution_5x5_pack4_bf16s.h"
...
...
@@ -116,6 +109,42 @@ Convolution_arm::Convolution_arm()
convolution_dilation1
=
0
;
}
static
void
convolution_transform_kernel_packed_neon
(
const
Mat
&
weight_data
,
Mat
&
weight_data_tm
,
int
num_input
,
int
num_output
,
int
kernel_w
,
int
kernel_h
,
int
elempack
,
int
out_elempack
)
{
const
int
maxk
=
kernel_w
*
kernel_h
;
// src = kw-kh-inch-outch
// dst = pb-pa-kw-kh-inch/pa-outch/pb
{
Mat
weight_data_r2
=
weight_data
.
reshape
(
maxk
,
num_input
,
num_output
);
weight_data_tm
.
create
(
maxk
,
num_input
/
elempack
,
num_output
/
out_elempack
,
(
size_t
)
4u
*
elempack
*
out_elempack
,
elempack
*
out_elempack
);
for
(
int
q
=
0
;
q
+
(
out_elempack
-
1
)
<
num_output
;
q
+=
out_elempack
)
{
float
*
g00
=
weight_data_tm
.
channel
(
q
/
out_elempack
);
for
(
int
p
=
0
;
p
+
(
elempack
-
1
)
<
num_input
;
p
+=
elempack
)
{
for
(
int
k
=
0
;
k
<
maxk
;
k
++
)
{
for
(
int
i
=
0
;
i
<
elempack
;
i
++
)
{
for
(
int
j
=
0
;
j
<
out_elempack
;
j
++
)
{
const
float
*
k00
=
weight_data_r2
.
channel
(
q
+
j
).
row
(
p
+
i
);
g00
[
0
]
=
k00
[
k
];
g00
++
;
}
}
}
}
}
}
}
int
Convolution_arm
::
create_pipeline
(
const
Option
&
opt
)
{
if
(
dynamic_weight
)
...
...
@@ -267,37 +296,32 @@ int Convolution_arm::create_pipeline(const Option& opt)
return
0
;
}
#if __ARM_NEON
// pack4
if
(
elempack
==
4
&&
out_elempack
==
4
)
if
((
elempack
==
4
&&
out_elempack
==
4
&&
kernel_w
==
3
&&
kernel_h
==
3
&&
dilation_w
==
1
&&
dilation_h
==
1
&&
stride_w
==
2
&&
stride_h
==
2
)
||
(
elempack
==
4
&&
out_elempack
==
4
&&
kernel_w
==
5
&&
kernel_h
==
5
&&
dilation_w
==
1
&&
dilation_h
==
1
&&
stride_w
==
1
&&
stride_h
==
1
)
||
(
elempack
==
4
&&
out_elempack
==
4
&&
kernel_w
==
5
&&
kernel_h
==
5
&&
dilation_w
==
1
&&
dilation_h
==
1
&&
stride_w
==
2
&&
stride_h
==
2
)
||
(
elempack
==
1
&&
out_elempack
==
4
&&
kernel_w
==
3
&&
kernel_h
==
3
&&
dilation_w
==
1
&&
dilation_h
==
1
&&
stride_w
==
1
&&
stride_h
==
1
)
||
(
elempack
==
1
&&
out_elempack
==
4
&&
kernel_w
==
3
&&
kernel_h
==
3
&&
dilation_w
==
1
&&
dilation_h
==
1
&&
stride_w
==
2
&&
stride_h
==
2
)
||
(
elempack
==
1
&&
out_elempack
==
4
&&
kernel_w
==
7
&&
kernel_h
==
7
&&
dilation_w
==
1
&&
dilation_h
==
1
&&
stride_w
==
2
&&
stride_h
==
2
))
{
convolution_transform_kernel_pack
4_neon
(
weight_data
,
weight_data_tm
,
num_input
,
num_output
,
kernel_w
,
kernel_h
);
convolution_transform_kernel_pack
ed_neon
(
weight_data
,
weight_data_tm
,
num_input
,
num_output
,
kernel_w
,
kernel_h
,
elempack
,
out_elempack
);
}
// pack1to4
if
(
elempack
==
1
&&
out_elempack
==
4
)
else
if
(
elempack
==
1
&&
out_elempack
==
1
&&
kernel_w
==
3
&&
kernel_h
==
3
&&
dilation_w
==
1
&&
dilation_h
==
1
&&
stride_w
==
2
&&
stride_h
==
2
)
{
conv
olution_transform_kernel_pack1to4_neon
(
weight_data
,
weight_data_tm
,
num_input
,
num_output
,
kernel_w
,
kernel_h
);
conv
3x3s2_transform_kernel_neon
(
weight_data
,
weight_3x3s2_data
,
num_input
,
num_output
);
}
// pack4to1
if
(
elempack
==
4
&&
out_elempack
==
1
)
else
if
((
elempack
==
1
&&
out_elempack
==
1
&&
kernel_w
==
1
&&
kernel_h
==
1
&&
dilation_w
==
1
&&
dilation_h
==
1
&&
stride_w
==
1
&&
stride_h
==
1
)
||
(
elempack
==
1
&&
out_elempack
==
1
&&
kernel_w
==
1
&&
kernel_h
==
1
&&
dilation_w
==
1
&&
dilation_h
==
1
&&
stride_w
==
2
&&
stride_h
==
2
)
||
(
elempack
==
1
&&
out_elempack
==
1
&&
kernel_w
==
4
&&
kernel_h
==
4
&&
dilation_w
==
1
&&
dilation_h
==
1
&&
stride_w
==
4
&&
stride_h
==
4
)
||
(
elempack
==
1
&&
out_elempack
==
1
&&
kernel_w
==
5
&&
kernel_h
==
5
&&
dilation_w
==
1
&&
dilation_h
==
1
&&
stride_w
==
1
&&
stride_h
==
1
)
||
(
elempack
==
1
&&
out_elempack
==
1
&&
kernel_w
==
5
&&
kernel_h
==
5
&&
dilation_w
==
1
&&
dilation_h
==
1
&&
stride_w
==
2
&&
stride_h
==
2
)
||
(
elempack
==
1
&&
out_elempack
==
1
&&
kernel_w
==
7
&&
kernel_h
==
7
&&
dilation_w
==
1
&&
dilation_h
==
1
&&
stride_w
==
1
&&
stride_h
==
1
)
||
(
elempack
==
1
&&
out_elempack
==
1
&&
kernel_w
==
7
&&
kernel_h
==
7
&&
dilation_w
==
1
&&
dilation_h
==
1
&&
stride_w
==
2
&&
stride_h
==
2
))
{
convolution_transform_kernel_pack4to1_neon
(
weight_data
,
weight_data_tm
,
num_input
,
num_output
,
kernel_w
,
kernel_h
)
;
weight_data_tm
=
weight_data
;
}
#endif // __ARM_NEON
// pack1
if
(
elempack
==
1
&&
out_elempack
==
1
)
else
{
if
(
kernel_w
==
3
&&
kernel_h
==
3
&&
dilation_w
==
1
&&
dilation_h
==
1
&&
stride_w
==
2
&&
stride_h
==
2
)
{
conv3x3s2_transform_kernel_neon
(
weight_data
,
weight_3x3s2_data
,
num_input
,
num_output
);
}
else
{
weight_data_tm
=
weight_data
;
}
convolution_transform_kernel_packed
(
weight_data
,
weight_data_tm
,
num_input
,
num_output
,
kernel_w
,
kernel_h
);
}
if
(
opt
.
lightmode
)
...
...
@@ -592,7 +616,7 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option
}
else
{
convolution_pack
4_neon
(
bottom_blob_bordered
,
top_blob
,
weight_data_tm
,
bias_data
,
kernel_w
,
kernel_h
,
dilation_w
,
dilation_h
,
stride_w
,
stride_h
,
activation_type
,
activation_params
,
opt
);
convolution_pack
ed
(
bottom_blob_bordered
,
top_blob
,
weight_data_tm
,
bias_data
,
kernel_w
,
kernel_h
,
dilation_w
,
dilation_h
,
stride_w
,
stride_h
,
activation_type
,
activation_params
,
opt
);
}
}
...
...
@@ -627,14 +651,14 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option
}
else
{
convolution_pack
1to4_neon
(
bottom_blob_bordered
,
top_blob
,
weight_data_tm
,
bias_data
,
kernel_w
,
kernel_h
,
dilation_w
,
dilation_h
,
stride_w
,
stride_h
,
activation_type
,
activation_params
,
opt
);
convolution_pack
ed
(
bottom_blob_bordered
,
top_blob
,
weight_data_tm
,
bias_data
,
kernel_w
,
kernel_h
,
dilation_w
,
dilation_h
,
stride_w
,
stride_h
,
activation_type
,
activation_params
,
opt
);
}
}
if
(
elempack
==
4
&&
out_elempack
==
1
)
{
{
convolution_pack
4to1_neon
(
bottom_blob_bordered
,
top_blob
,
weight_data_tm
,
bias_data
,
kernel_w
,
kernel_h
,
dilation_w
,
dilation_h
,
stride_w
,
stride_h
,
activation_type
,
activation_params
,
opt
);
convolution_pack
ed
(
bottom_blob_bordered
,
top_blob
,
weight_data_tm
,
bias_data
,
kernel_w
,
kernel_h
,
dilation_w
,
dilation_h
,
stride_w
,
stride_h
,
activation_type
,
activation_params
,
opt
);
}
}
#endif // __ARM_NEON
...
...
@@ -715,70 +739,7 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option
}
else
{
const
int
maxk
=
kernel_w
*
kernel_h
;
// kernel offsets
std
::
vector
<
int
>
_space_ofs
(
maxk
);
int
*
space_ofs
=
&
_space_ofs
[
0
];
{
int
p1
=
0
;
int
p2
=
0
;
int
gap
=
w
*
dilation_h
-
kernel_w
*
dilation_w
;
for
(
int
i
=
0
;
i
<
kernel_h
;
i
++
)
{
for
(
int
j
=
0
;
j
<
kernel_w
;
j
++
)
{
space_ofs
[
p1
]
=
p2
;
p1
++
;
p2
+=
dilation_w
;
}
p2
+=
gap
;
}
}
// num_output
#pragma omp parallel for num_threads(opt.num_threads)
for
(
int
p
=
0
;
p
<
num_output
;
p
++
)
{
float
*
outptr
=
top_blob
.
channel
(
p
);
for
(
int
i
=
0
;
i
<
outh
;
i
++
)
{
for
(
int
j
=
0
;
j
<
outw
;
j
++
)
{
float
sum
=
0.
f
;
if
(
bias_term
)
{
sum
=
bias_data
[
p
];
}
const
float
*
kptr
=
(
const
float
*
)
weight_data_tm
+
maxk
*
channels
*
p
;
// channels
for
(
int
q
=
0
;
q
<
channels
;
q
++
)
{
const
Mat
m
=
bottom_blob_bordered
.
channel
(
q
);
const
float
*
sptr
=
m
.
row
(
i
*
stride_h
)
+
j
*
stride_w
;
for
(
int
k
=
0
;
k
<
maxk
;
k
++
)
{
float
val
=
sptr
[
space_ofs
[
k
]];
float
wt
=
kptr
[
k
];
sum
+=
val
*
wt
;
}
kptr
+=
maxk
;
}
sum
=
activation_ss
(
sum
,
activation_type
,
activation_params
);
outptr
[
j
]
=
sum
;
}
outptr
+=
outw
;
}
}
convolution_packed
(
bottom_blob_bordered
,
top_blob
,
weight_data_tm
,
bias_data
,
kernel_w
,
kernel_h
,
dilation_w
,
dilation_h
,
stride_w
,
stride_h
,
activation_type
,
activation_params
,
opt
);
}
}
...
...
@@ -894,6 +855,42 @@ int Convolution_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<M
}
#if NCNN_BF16
static
void
convolution_transform_kernel_packed_bf16s_neon
(
const
Mat
&
weight_data
,
Mat
&
weight_data_tm
,
int
num_input
,
int
num_output
,
int
kernel_w
,
int
kernel_h
,
int
elempack
,
int
out_elempack
)
{
const
int
maxk
=
kernel_w
*
kernel_h
;
// src = kw-kh-inch-outch
// dst = pb-pa-kw-kh-inch/pa-outch/pb
{
Mat
weight_data_r2
=
weight_data
.
reshape
(
maxk
,
num_input
,
num_output
);
weight_data_tm
.
create
(
maxk
,
num_input
/
elempack
,
num_output
/
out_elempack
,
(
size_t
)
2u
*
elempack
*
out_elempack
,
elempack
*
out_elempack
);
for
(
int
q
=
0
;
q
+
(
out_elempack
-
1
)
<
num_output
;
q
+=
out_elempack
)
{
unsigned
short
*
g00
=
weight_data_tm
.
channel
(
q
/
out_elempack
);
for
(
int
p
=
0
;
p
+
(
elempack
-
1
)
<
num_input
;
p
+=
elempack
)
{
for
(
int
k
=
0
;
k
<
maxk
;
k
++
)
{
for
(
int
i
=
0
;
i
<
elempack
;
i
++
)
{
for
(
int
j
=
0
;
j
<
out_elempack
;
j
++
)
{
const
float
*
k00
=
weight_data_r2
.
channel
(
q
+
j
).
row
(
p
+
i
);
g00
[
0
]
=
float32_to_bfloat16
(
k00
[
k
]);
g00
++
;
}
}
}
}
}
}
}
int
Convolution_arm
::
create_pipeline_bf16s
(
const
Option
&
opt
)
{
const
int
maxk
=
kernel_w
*
kernel_h
;
...
...
@@ -976,30 +973,18 @@ int Convolution_arm::create_pipeline_bf16s(const Option& opt)
return
0
;
}
#if __ARM_NEON
// pack4
if
(
elempack
==
4
&&
out_elempack
==
4
)
{
convolution_transform_kernel_pack4_bf16s_neon
(
weight_data
,
weight_data_tm
,
num_input
,
num_output
,
kernel_w
,
kernel_h
);
}
// pack1to4
if
(
elempack
==
1
&&
out_elempack
==
4
)
{
convolution_transform_kernel_pack1to4_bf16s_neon
(
weight_data
,
weight_data_tm
,
num_input
,
num_output
,
kernel_w
,
kernel_h
);
}
// pack4to1
if
(
elempack
==
4
&&
out_elempack
==
1
)
if
((
elempack
==
4
&&
out_elempack
==
4
&&
kernel_w
==
3
&&
kernel_h
==
3
&&
dilation_w
==
1
&&
dilation_h
==
1
&&
stride_w
==
2
&&
stride_h
==
2
)
||
(
elempack
==
4
&&
out_elempack
==
4
&&
kernel_w
==
5
&&
kernel_h
==
5
&&
dilation_w
==
1
&&
dilation_h
==
1
&&
stride_w
==
1
&&
stride_h
==
1
)
||
(
elempack
==
4
&&
out_elempack
==
4
&&
kernel_w
==
5
&&
kernel_h
==
5
&&
dilation_w
==
1
&&
dilation_h
==
1
&&
stride_w
==
2
&&
stride_h
==
2
)
||
(
elempack
==
1
&&
out_elempack
==
4
&&
kernel_w
==
3
&&
kernel_h
==
3
&&
dilation_w
==
1
&&
dilation_h
==
1
&&
stride_w
==
1
&&
stride_h
==
1
)
||
(
elempack
==
1
&&
out_elempack
==
4
&&
kernel_w
==
3
&&
kernel_h
==
3
&&
dilation_w
==
1
&&
dilation_h
==
1
&&
stride_w
==
2
&&
stride_h
==
2
)
||
(
elempack
==
1
&&
out_elempack
==
4
&&
kernel_w
==
7
&&
kernel_h
==
7
&&
dilation_w
==
1
&&
dilation_h
==
1
&&
stride_w
==
2
&&
stride_h
==
2
))
{
convolution_transform_kernel_pack
4to1_bf16s_neon
(
weight_data
,
weight_data_tm
,
num_input
,
num_output
,
kernel_w
,
kernel_h
);
convolution_transform_kernel_pack
ed_bf16s_neon
(
weight_data
,
weight_data_tm
,
num_input
,
num_output
,
kernel_w
,
kernel_h
,
elempack
,
out_elempack
);
}
#endif // __ARM_NEON
// pack1
if
(
elempack
==
1
&&
out_elempack
==
1
)
else
{
ncnn
::
cast_float32_to_bfloat16
(
weight_data
,
weight_data_tm
,
opt
);
convolution_transform_kernel_packed_bf16s
(
weight_data
,
weight_data_tm
,
num_input
,
num_output
,
kernel_w
,
kernel_h
);
}
if
(
opt
.
lightmode
)
...
...
@@ -1209,7 +1194,7 @@ int Convolution_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const
}
else
{
convolution_pack
4_bf16s_neon
(
bottom_blob_bordered
,
top_blob
,
weight_data_tm
,
bias_data
,
kernel_w
,
kernel_h
,
dilation_w
,
dilation_h
,
stride_w
,
stride_h
,
activation_type
,
activation_params
,
opt
);
convolution_pack
ed_bf16s
(
bottom_blob_bordered
,
top_blob
,
weight_data_tm
,
bias_data
,
kernel_w
,
kernel_h
,
dilation_w
,
dilation_h
,
stride_w
,
stride_h
,
activation_type
,
activation_params
,
opt
);
}
}
...
...
@@ -1244,14 +1229,14 @@ int Convolution_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const
}
else
{
convolution_pack
1to4_bf16s_neon
(
bottom_blob_bordered
,
top_blob
,
weight_data_tm
,
bias_data
,
kernel_w
,
kernel_h
,
dilation_w
,
dilation_h
,
stride_w
,
stride_h
,
activation_type
,
activation_params
,
opt
);
convolution_pack
ed_bf16s
(
bottom_blob_bordered
,
top_blob
,
weight_data_tm
,
bias_data
,
kernel_w
,
kernel_h
,
dilation_w
,
dilation_h
,
stride_w
,
stride_h
,
activation_type
,
activation_params
,
opt
);
}
}
if
(
elempack
==
4
&&
out_elempack
==
1
)
{
{
convolution_pack
4to1_bf16s_neon
(
bottom_blob_bordered
,
top_blob
,
weight_data_tm
,
bias_data
,
kernel_w
,
kernel_h
,
dilation_w
,
dilation_h
,
stride_w
,
stride_h
,
activation_type
,
activation_params
,
opt
);
convolution_pack
ed_bf16s
(
bottom_blob_bordered
,
top_blob
,
weight_data_tm
,
bias_data
,
kernel_w
,
kernel_h
,
dilation_w
,
dilation_h
,
stride_w
,
stride_h
,
activation_type
,
activation_params
,
opt
);
}
}
#endif // __ARM_NEON
...
...
@@ -1259,7 +1244,7 @@ int Convolution_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const
if
(
elempack
==
1
&&
out_elempack
==
1
)
{
{
convolution_bf16s
(
bottom_blob_bordered
,
top_blob
,
weight_data_tm
,
bias_data
,
kernel_w
,
kernel_h
,
dilation_w
,
dilation_h
,
stride_w
,
stride_h
,
activation_type
,
activation_params
,
opt
);
convolution_
packed_
bf16s
(
bottom_blob_bordered
,
top_blob
,
weight_data_tm
,
bias_data
,
kernel_w
,
kernel_h
,
dilation_w
,
dilation_h
,
stride_w
,
stride_h
,
activation_type
,
activation_params
,
opt
);
}
}
...
...
src/layer/arm/convolution_arm_asimdhp.cpp
浏览文件 @
5ac17df7
...
...
@@ -27,15 +27,7 @@ namespace ncnn {
#if __ARM_NEON
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#include "convolution_fp16s.h"
#include "convolution_pack8_fp16s.h"
#include "convolution_pack1to8_fp16s.h"
#include "convolution_pack4to8_fp16s.h"
#include "convolution_pack8to1_fp16s.h"
#include "convolution_pack8to4_fp16s.h"
#include "convolution_pack4_fp16s.h"
#include "convolution_pack1to4_fp16s.h"
#include "convolution_pack4to1_fp16s.h"
#include "convolution_packed_fp16s.h"
#include "convolution_3x3_winograd_fp16s.h"
...
...
@@ -204,58 +196,22 @@ int Convolution_arm::create_pipeline_fp16s(const Option& opt)
return
0
;
}
// pack8
if
(
elempack
==
8
&&
out_elempack
==
8
)
if
((
elempack
==
8
&&
out_elempack
==
8
&&
kernel_w
==
3
&&
kernel_h
==
3
&&
dilation_w
==
1
&&
dilation_h
==
1
&&
stride_w
==
1
&&
stride_h
==
1
)
||
(
elempack
==
8
&&
out_elempack
==
8
&&
kernel_w
==
3
&&
kernel_h
==
3
&&
dilation_w
==
1
&&
dilation_h
==
1
&&
stride_w
==
2
&&
stride_h
==
2
)
||
(
elempack
==
8
&&
out_elempack
==
8
&&
kernel_w
==
5
&&
kernel_h
==
5
&&
dilation_w
==
1
&&
dilation_h
==
1
&&
stride_w
==
1
&&
stride_h
==
1
)
||
(
elempack
==
8
&&
out_elempack
==
8
&&
kernel_w
==
5
&&
kernel_h
==
5
&&
dilation_w
==
1
&&
dilation_h
==
1
&&
stride_w
==
2
&&
stride_h
==
2
)
||
(
elempack
==
1
&&
out_elempack
==
8
&&
kernel_w
==
3
&&
kernel_h
==
3
&&
dilation_w
==
1
&&
dilation_h
==
1
&&
stride_w
==
1
&&
stride_h
==
1
)
||
(
elempack
==
1
&&
out_elempack
==
8
&&
kernel_w
==
3
&&
kernel_h
==
3
&&
dilation_w
==
1
&&
dilation_h
==
1
&&
stride_w
==
2
&&
stride_h
==
2
)
||
(
elempack
==
1
&&
out_elempack
==
8
&&
kernel_w
==
7
&&
kernel_h
==
7
&&
dilation_w
==
1
&&
dilation_h
==
1
&&
stride_w
==
2
&&
stride_h
==
2
)
||
(
opt
.
use_fp16_arithmetic
&&
elempack
==
4
&&
out_elempack
==
4
&&
kernel_w
==
3
&&
kernel_h
==
3
&&
dilation_w
==
1
&&
dilation_h
==
1
&&
stride_w
==
1
&&
stride_h
==
1
)
||
(
opt
.
use_fp16_arithmetic
&&
elempack
==
1
&&
out_elempack
==
4
&&
kernel_w
==
3
&&
kernel_h
==
3
&&
dilation_w
==
1
&&
dilation_h
==
1
&&
stride_w
==
1
&&
stride_h
==
1
)
||
(
opt
.
use_fp16_arithmetic
&&
elempack
==
1
&&
out_elempack
==
4
&&
kernel_w
==
3
&&
kernel_h
==
3
&&
dilation_w
==
1
&&
dilation_h
==
1
&&
stride_w
==
2
&&
stride_h
==
2
))
{
convolution_transform_kernel_packed_fp16s_neon
(
weight_data
,
weight_data_tm
,
num_input
,
num_output
,
kernel_w
,
kernel_h
,
elempack
,
out_elempack
);
}
// pack1to8
if
(
elempack
==
1
&&
out_elempack
==
8
)
else
{
convolution_transform_kernel_packed_fp16s_neon
(
weight_data
,
weight_data_tm
,
num_input
,
num_output
,
kernel_w
,
kernel_h
,
elempack
,
out_elempack
);
}
// pack4to8
if
(
elempack
==
4
&&
out_elempack
==
8
)
{
convolution_transform_kernel_packed_fp16s_neon
(
weight_data
,
weight_data_tm
,
num_input
,
num_output
,
kernel_w
,
kernel_h
,
elempack
,
out_elempack
);
}
// pack8to1
if
(
elempack
==
8
&&
out_elempack
==
1
)
{
convolution_transform_kernel_packed_fp16s_neon
(
weight_data
,
weight_data_tm
,
num_input
,
num_output
,
kernel_w
,
kernel_h
,
elempack
,
out_elempack
);
}
// pack8to4
if
(
elempack
==
8
&&
out_elempack
==
4
)
{
convolution_transform_kernel_packed_fp16s_neon
(
weight_data
,
weight_data_tm
,
num_input
,
num_output
,
kernel_w
,
kernel_h
,
elempack
,
out_elempack
);
}
// pack4
if
(
elempack
==
4
&&
out_elempack
==
4
)
{
convolution_transform_kernel_packed_fp16s_neon
(
weight_data
,
weight_data_tm
,
num_input
,
num_output
,
kernel_w
,
kernel_h
,
elempack
,
out_elempack
);
}
// pack1to4
if
(
elempack
==
1
&&
out_elempack
==
4
)
{
convolution_transform_kernel_packed_fp16s_neon
(
weight_data
,
weight_data_tm
,
num_input
,
num_output
,
kernel_w
,
kernel_h
,
elempack
,
out_elempack
);
}
// pack4to1
if
(
elempack
==
4
&&
out_elempack
==
1
)
{
convolution_transform_kernel_packed_fp16s_neon
(
weight_data
,
weight_data_tm
,
num_input
,
num_output
,
kernel_w
,
kernel_h
,
elempack
,
out_elempack
);
}
// pack1
if
(
elempack
==
1
&&
out_elempack
==
1
)
{
convolution_transform_kernel_packed_fp16s_neon
(
weight_data
,
weight_data_tm
,
num_input
,
num_output
,
kernel_w
,
kernel_h
,
elempack
,
out_elempack
);
convolution_transform_kernel_packed_fp16s
(
weight_data
,
weight_data_tm
,
num_input
,
num_output
,
kernel_w
,
kernel_h
);
}
if
(
opt
.
use_fp16_arithmetic
)
...
...
@@ -308,22 +264,22 @@ int Convolution_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const
if
(
elempack
==
4
&&
out_elempack
==
4
)
{
convolution_pack
4_fp16s_neon
(
bottom_blob_bordered
,
top_blob
,
weight_data_tm
,
bias_data
,
kernel_w
,
kernel_h
,
dilation_w
,
dilation_h
,
stride_w
,
stride_h
,
activation_type
,
activation_params
,
opt
);
convolution_pack
ed_fp16s
(
bottom_blob_bordered
,
top_blob
,
weight_data_tm
,
bias_data
,
kernel_w
,
kernel_h
,
dilation_w
,
dilation_h
,
stride_w
,
stride_h
,
activation_type
,
activation_params
,
opt
);
}
if
(
elempack
==
1
&&
out_elempack
==
4
)
{
convolution_pack
1to4_fp16s_neon
(
bottom_blob_bordered
,
top_blob
,
weight_data_tm
,
bias_data
,
kernel_w
,
kernel_h
,
dilation_w
,
dilation_h
,
stride_w
,
stride_h
,
activation_type
,
activation_params
,
opt
);
convolution_pack
ed_fp16s
(
bottom_blob_bordered
,
top_blob
,
weight_data_tm
,
bias_data
,
kernel_w
,
kernel_h
,
dilation_w
,
dilation_h
,
stride_w
,
stride_h
,
activation_type
,
activation_params
,
opt
);
}
if
(
elempack
==
4
&&
out_elempack
==
1
)
{
convolution_pack
4to1_fp16s_neon
(
bottom_blob_bordered
,
top_blob
,
weight_data_tm
,
bias_data
,
kernel_w
,
kernel_h
,
dilation_w
,
dilation_h
,
stride_w
,
stride_h
,
activation_type
,
activation_params
,
opt
);
convolution_pack
ed_fp16s
(
bottom_blob_bordered
,
top_blob
,
weight_data_tm
,
bias_data
,
kernel_w
,
kernel_h
,
dilation_w
,
dilation_h
,
stride_w
,
stride_h
,
activation_type
,
activation_params
,
opt
);
}
if
(
elempack
==
1
&&
out_elempack
==
1
)
{
convolution_fp16s
(
bottom_blob_bordered
,
top_blob
,
weight_data_tm
,
bias_data
,
kernel_w
,
kernel_h
,
dilation_w
,
dilation_h
,
stride_w
,
stride_h
,
activation_type
,
activation_params
,
opt
);
convolution_
packed_
fp16s
(
bottom_blob_bordered
,
top_blob
,
weight_data_tm
,
bias_data
,
kernel_w
,
kernel_h
,
dilation_w
,
dilation_h
,
stride_w
,
stride_h
,
activation_type
,
activation_params
,
opt
);
}
return
0
;
...
...
@@ -562,7 +518,7 @@ int Convolution_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const
}
else
{
convolution_pack
8_fp16sa_neon
(
bottom_blob_bordered
,
top_blob
,
weight_data_tm
,
bias_data_fp16
,
kernel_w
,
kernel_h
,
dilation_w
,
dilation_h
,
stride_w
,
stride_h
,
activation_type
,
activation_params
,
opt
);
convolution_pack
ed_fp16sa
(
bottom_blob_bordered
,
top_blob
,
weight_data_tm
,
bias_data_fp16
,
kernel_w
,
kernel_h
,
dilation_w
,
dilation_h
,
stride_w
,
stride_h
,
activation_type
,
activation_params
,
opt
);
}
}
...
...
@@ -597,28 +553,28 @@ int Convolution_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const
}
else
{
convolution_pack
1to8_fp16sa_neon
(
bottom_blob_bordered
,
top_blob
,
weight_data_tm
,
bias_data_fp16
,
kernel_w
,
kernel_h
,
dilation_w
,
dilation_h
,
stride_w
,
stride_h
,
activation_type
,
activation_params
,
opt
);
convolution_pack
ed_fp16sa
(
bottom_blob_bordered
,
top_blob
,
weight_data_tm
,
bias_data_fp16
,
kernel_w
,
kernel_h
,
dilation_w
,
dilation_h
,
stride_w
,
stride_h
,
activation_type
,
activation_params
,
opt
);
}
}
if
(
elempack
==
4
&&
out_elempack
==
8
)
{
{
convolution_pack
4to8_fp16sa_neon
(
bottom_blob_bordered
,
top_blob
,
weight_data_tm
,
bias_data_fp16
,
kernel_w
,
kernel_h
,
dilation_w
,
dilation_h
,
stride_w
,
stride_h
,
activation_type
,
activation_params
,
opt
);
convolution_pack
ed_fp16sa
(
bottom_blob_bordered
,
top_blob
,
weight_data_tm
,
bias_data_fp16
,
kernel_w
,
kernel_h
,
dilation_w
,
dilation_h
,
stride_w
,
stride_h
,
activation_type
,
activation_params
,
opt
);
}
}
if
(
elempack
==
8
&&
out_elempack
==
1
)
{
{
convolution_pack
8to1_fp16sa_neon
(
bottom_blob_bordered
,
top_blob
,
weight_data_tm
,
bias_data
,
kernel_w
,
kernel_h
,
dilation_w
,
dilation_h
,
stride_w
,
stride_h
,
activation_type
,
activation_params
,
opt
);
convolution_pack
ed_fp16sa
(
bottom_blob_bordered
,
top_blob
,
weight_data_tm
,
bias_data_fp16
,
kernel_w
,
kernel_h
,
dilation_w
,
dilation_h
,
stride_w
,
stride_h
,
activation_type
,
activation_params
,
opt
);
}
}
if
(
elempack
==
8
&&
out_elempack
==
4
)
{
{
convolution_pack
8to4_fp16sa_neon
(
bottom_blob_bordered
,
top_blob
,
weight_data_tm
,
bias_data_fp16
,
kernel_w
,
kernel_h
,
dilation_w
,
dilation_h
,
stride_w
,
stride_h
,
activation_type
,
activation_params
,
opt
);
convolution_pack
ed_fp16sa
(
bottom_blob_bordered
,
top_blob
,
weight_data_tm
,
bias_data_fp16
,
kernel_w
,
kernel_h
,
dilation_w
,
dilation_h
,
stride_w
,
stride_h
,
activation_type
,
activation_params
,
opt
);
}
}
...
...
@@ -635,7 +591,7 @@ int Convolution_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const
}
else
{
convolution_pack
4_fp16sa_neon
(
bottom_blob_bordered
,
top_blob
,
weight_data_tm
,
bias_data_fp16
,
kernel_w
,
kernel_h
,
dilation_w
,
dilation_h
,
stride_w
,
stride_h
,
activation_type
,
activation_params
,
opt
);
convolution_pack
ed_fp16sa
(
bottom_blob_bordered
,
top_blob
,
weight_data_tm
,
bias_data_fp16
,
kernel_w
,
kernel_h
,
dilation_w
,
dilation_h
,
stride_w
,
stride_h
,
activation_type
,
activation_params
,
opt
);
}
}
...
...
@@ -661,19 +617,21 @@ int Convolution_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const
}
else
{
convolution_pack
1to4_fp16sa_neon
(
bottom_blob_bordered
,
top_blob
,
weight_data_tm
,
bias_data_fp16
,
kernel_w
,
kernel_h
,
dilation_w
,
dilation_h
,
stride_w
,
stride_h
,
activation_type
,
activation_params
,
opt
);
convolution_pack
ed_fp16sa
(
bottom_blob_bordered
,
top_blob
,
weight_data_tm
,
bias_data_fp16
,
kernel_w
,
kernel_h
,
dilation_w
,
dilation_h
,
stride_w
,
stride_h
,
activation_type
,
activation_params
,
opt
);
}
}
if
(
elempack
==
4
&&
out_elempack
==
1
)
{
convolution_pack4to1_fp16sa_neon
(
bottom_blob_bordered
,
top_blob
,
weight_data_tm
,
bias_data
,
kernel_w
,
kernel_h
,
dilation_w
,
dilation_h
,
stride_w
,
stride_h
,
activation_type
,
activation_params
,
opt
);
{
convolution_packed_fp16sa
(
bottom_blob_bordered
,
top_blob
,
weight_data_tm
,
bias_data_fp16
,
kernel_w
,
kernel_h
,
dilation_w
,
dilation_h
,
stride_w
,
stride_h
,
activation_type
,
activation_params
,
opt
);
}
}
if
(
elempack
==
1
&&
out_elempack
==
1
)
{
{
convolution_
fp16s
(
bottom_blob_bordered
,
top_blob
,
weight_data_tm
,
bias_data
,
kernel_w
,
kernel_h
,
dilation_w
,
dilation_h
,
stride_w
,
stride_h
,
activation_type
,
activation_params
,
opt
);
convolution_
packed_fp16sa
(
bottom_blob_bordered
,
top_blob
,
weight_data_tm
,
bias_data_fp16
,
kernel_w
,
kernel_h
,
dilation_w
,
dilation_h
,
stride_w
,
stride_h
,
activation_type
,
activation_params
,
opt
);
}
}
...
...
src/layer/arm/convolution_bf16s.h
已删除
100644 → 0
浏览文件 @
8049623d
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
static
void
convolution_bf16s
(
const
Mat
&
bottom_blob
,
Mat
&
top_blob
,
const
Mat
&
weight_data_bf16
,
const
Mat
&
bias_data
,
int
kernel_w
,
int
kernel_h
,
int
dilation_w
,
int
dilation_h
,
int
stride_w
,
int
stride_h
,
int
activation_type
,
const
Mat
&
activation_params
,
const
Option
&
opt
)
{
int
w
=
bottom_blob
.
w
;
int
channels
=
bottom_blob
.
c
;
int
outw
=
top_blob
.
w
;
int
outh
=
top_blob
.
h
;
int
outch
=
top_blob
.
c
;
const
int
maxk
=
kernel_w
*
kernel_h
;
// kernel offsets
std
::
vector
<
int
>
_space_ofs
(
maxk
);
int
*
space_ofs
=
&
_space_ofs
[
0
];
{
int
p1
=
0
;
int
p2
=
0
;
int
gap
=
w
*
dilation_h
-
kernel_w
*
dilation_w
;
for
(
int
i
=
0
;
i
<
kernel_h
;
i
++
)
{
for
(
int
j
=
0
;
j
<
kernel_w
;
j
++
)
{
space_ofs
[
p1
]
=
p2
;
p1
++
;
p2
+=
dilation_w
;
}
p2
+=
gap
;
}
}
const
float
*
bias_data_ptr
=
bias_data
;
// num_output
#pragma omp parallel for num_threads(opt.num_threads)
for
(
int
p
=
0
;
p
<
outch
;
p
++
)
{
unsigned
short
*
outptr
=
top_blob
.
channel
(
p
);
for
(
int
i
=
0
;
i
<
outh
;
i
++
)
{
for
(
int
j
=
0
;
j
<
outw
;
j
++
)
{
float
sum
=
0
.
f
;
if
(
bias_data_ptr
)
{
sum
=
bias_data_ptr
[
p
];
}
const
unsigned
short
*
kptr
=
(
const
unsigned
short
*
)
weight_data_bf16
+
maxk
*
channels
*
p
;
// channels
for
(
int
q
=
0
;
q
<
channels
;
q
++
)
{
const
Mat
m
=
bottom_blob
.
channel
(
q
);
const
unsigned
short
*
sptr
=
m
.
row
<
unsigned
short
>
(
i
*
stride_h
)
+
j
*
stride_w
;
for
(
int
k
=
0
;
k
<
maxk
;
k
++
)
{
float
val
=
bfloat16_to_float32
(
sptr
[
space_ofs
[
k
]]);
float
wt
=
bfloat16_to_float32
(
kptr
[
k
]);
sum
+=
val
*
wt
;
}
kptr
+=
maxk
;
}
sum
=
activation_ss
(
sum
,
activation_type
,
activation_params
);
outptr
[
j
]
=
float32_to_bfloat16
(
sum
);
}
outptr
+=
outw
;
}
}
}
src/layer/arm/convolution_fp16s.h
已删除
100644 → 0
浏览文件 @
8049623d
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
// Naive reference convolution with fp16 storage: activations and weights are
// __fp16 in memory, accumulation is carried out in fp32 for precision.
//   bottom_blob       input blob, elempack=1, __fp16 storage
//   top_blob          pre-allocated output blob, __fp16 storage
//   weight_data_fp16  weights laid out kw-kh-inch-outch, __fp16 storage
//   bias_data         optional fp32 bias, one value per output channel (may be empty)
// Fix: the inner loop used a local `float w` that shadowed the blob width
// `int w` above — renamed to `wt`, consistent with the bf16s sibling routine.
static void convolution_fp16s(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
{
    int w = bottom_blob.w;
    int channels = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets
    // space_ofs[k] is the element offset of tap k from the window origin,
    // accounting for dilation; gap jumps to the next kernel row.
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        int gap = w * dilation_h - kernel_w * dilation_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2 += dilation_w;
            }
            p2 += gap;
        }
    }

    const float* bias_data_ptr = bias_data;

    // num_output
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        __fp16* outptr = top_blob.channel(p);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                float sum = 0.f;

                if (bias_data_ptr)
                {
                    sum = bias_data_ptr[p];
                }

                const __fp16* kptr = weight_data_fp16.channel(p);

                // channels
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob.channel(q);
                    const __fp16* sptr = m.row<__fp16>(i * stride_h) + j * stride_w;

                    for (int k = 0; k < maxk; k++)
                    {
                        float val = (float)sptr[space_ofs[k]];
                        float wt = (float)kptr[k];
                        sum += val * wt;
                    }

                    kptr += maxk;
                }

                sum = activation_ss(sum, activation_type, activation_params);

                outptr[j] = (__fp16)sum;
            }

            outptr += outw;
        }
    }
}
src/layer/arm/convolution_pack1to4.h
已删除
100644 → 0
浏览文件 @
8049623d
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
static
void
convolution_transform_kernel_pack1to4_neon
(
const
Mat
&
weight_data
,
Mat
&
weight_data_pack1to4
,
int
num_input
,
int
num_output
,
int
kernel_w
,
int
kernel_h
)
{
const
int
maxk
=
kernel_w
*
kernel_h
;
// src = kw-kh-inch-outch
// dst = 4b-kw-kh-inch-outch/4b
Mat
weight_data_r2
=
weight_data
.
reshape
(
maxk
,
num_input
,
num_output
);
weight_data_pack1to4
.
create
(
maxk
,
num_input
,
num_output
/
4
,
(
size_t
)
4
*
4
,
4
);
for
(
int
q
=
0
;
q
+
3
<
num_output
;
q
+=
4
)
{
const
Mat
k0
=
weight_data_r2
.
channel
(
q
);
const
Mat
k1
=
weight_data_r2
.
channel
(
q
+
1
);
const
Mat
k2
=
weight_data_r2
.
channel
(
q
+
2
);
const
Mat
k3
=
weight_data_r2
.
channel
(
q
+
3
);
float
*
g00
=
weight_data_pack1to4
.
channel
(
q
/
4
);
for
(
int
p
=
0
;
p
<
num_input
;
p
++
)
{
const
float
*
k00
=
k0
.
row
(
p
);
const
float
*
k10
=
k1
.
row
(
p
);
const
float
*
k20
=
k2
.
row
(
p
);
const
float
*
k30
=
k3
.
row
(
p
);
for
(
int
k
=
0
;
k
<
maxk
;
k
++
)
{
g00
[
0
]
=
k00
[
k
];
g00
[
1
]
=
k10
[
k
];
g00
[
2
]
=
k20
[
k
];
g00
[
3
]
=
k30
[
k
];
g00
+=
4
;
}
}
}
}
// fp32 convolution, elempack 1 input -> elempack 4 output.
// Each input scalar is broadcast and multiplied against a float32x4 of
// interleaved weights (see convolution_transform_kernel_pack1to4_neon),
// producing four output channels per accumulator.
static void convolution_pack1to4_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_pack1to4, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
{
    int w = bottom_blob.w;
    int channels = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets
    // space_ofs[k]: element offset of kernel tap k from the window origin,
    // dilation included; gap advances to the next kernel row.
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        int gap = w * dilation_h - kernel_w * dilation_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2 += dilation_w;
            }
            p2 += gap;
        }
    }

    const float* bias_data_ptr = bias_data;

    // num_output
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        float* outptr = top_blob.channel(p);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                // accumulator holds 4 output channels for this spatial position
                float32x4_t _sum = vdupq_n_f32(0.f);

                if (bias_data_ptr)
                {
                    _sum = vld1q_f32(bias_data_ptr + p * 4);
                }

                // weights for packed output channel p: 4 floats per tap
                const float* kptr = (const float*)weight_data_pack1to4 + maxk * channels * p * 4;

                // channels
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob.channel(q);
                    const float* sptr = m.row(i * stride_h) + j * stride_w;

                    for (int k = 0; k < maxk; k++) // 29.23
                    {
                        // broadcast the input scalar across 4 lanes
                        float32x4_t _val = vdupq_n_f32(sptr[space_ofs[k]]);
                        float32x4_t _w = vld1q_f32(kptr);
                        _sum = vmlaq_f32(_sum, _val, _w);

                        kptr += 4;
                    }
                }

                _sum = activation_ps(_sum, activation_type, activation_params);

                vst1q_f32(outptr + j * 4, _sum);
            }

            outptr += outw * 4;
        }
    }
}
src/layer/arm/convolution_pack1to4_bf16s.h
已删除
100644 → 0
浏览文件 @
8049623d
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
static
void
convolution_transform_kernel_pack1to4_bf16s_neon
(
const
Mat
&
weight_data
,
Mat
&
weight_data_bf16
,
int
num_input
,
int
num_output
,
int
kernel_w
,
int
kernel_h
)
{
const
int
maxk
=
kernel_w
*
kernel_h
;
// src = kw-kh-inch-outch
// dst = 4b-kw-kh-inch-outch/4b
Mat
weight_data_r2
=
weight_data
.
reshape
(
maxk
,
num_input
,
num_output
);
weight_data_bf16
.
create
(
maxk
,
num_input
,
num_output
/
4
,
(
size_t
)
2
*
4
,
4
);
for
(
int
q
=
0
;
q
+
3
<
num_output
;
q
+=
4
)
{
const
Mat
k0
=
weight_data_r2
.
channel
(
q
);
const
Mat
k1
=
weight_data_r2
.
channel
(
q
+
1
);
const
Mat
k2
=
weight_data_r2
.
channel
(
q
+
2
);
const
Mat
k3
=
weight_data_r2
.
channel
(
q
+
3
);
unsigned
short
*
g00
=
weight_data_bf16
.
channel
(
q
/
4
);
for
(
int
p
=
0
;
p
<
num_input
;
p
++
)
{
const
float
*
k00
=
k0
.
row
(
p
);
const
float
*
k10
=
k1
.
row
(
p
);
const
float
*
k20
=
k2
.
row
(
p
);
const
float
*
k30
=
k3
.
row
(
p
);
for
(
int
k
=
0
;
k
<
maxk
;
k
++
)
{
g00
[
0
]
=
float32_to_bfloat16
(
k00
[
k
]);
g00
[
1
]
=
float32_to_bfloat16
(
k10
[
k
]);
g00
[
2
]
=
float32_to_bfloat16
(
k20
[
k
]);
g00
[
3
]
=
float32_to_bfloat16
(
k30
[
k
]);
g00
+=
4
;
}
}
}
}
// bf16-storage convolution, elempack 1 input -> elempack 4 output.
// Inputs/weights/outputs are bf16 in memory; the arithmetic is fp32:
// each bf16 input scalar is widened and broadcast, weights are widened
// four at a time, and the result is narrowed back to bf16 on store.
static void convolution_pack1to4_bf16s_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_bf16, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
{
    int w = bottom_blob.w;
    int channels = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets
    // space_ofs[k]: element offset of kernel tap k from the window origin,
    // dilation included; gap advances to the next kernel row.
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        int gap = w * dilation_h - kernel_w * dilation_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2 += dilation_w;
            }
            p2 += gap;
        }
    }

    // bias stays fp32
    const float* bias_data_ptr = bias_data;

    // num_output
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        unsigned short* outptr = top_blob.channel(p);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                // fp32 accumulator for 4 output channels
                float32x4_t _sum = vdupq_n_f32(0.f);

                if (bias_data_ptr)
                {
                    _sum = vld1q_f32(bias_data_ptr + p * 4);
                }

                const unsigned short* kptr = weight_data_bf16.channel(p);

                // channels
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob.channel(q);
                    const unsigned short* sptr = m.row<const unsigned short>(i * stride_h) + j * stride_w;

                    for (int k = 0; k < maxk; k++)
                    {
                        // widen input scalar to fp32 and broadcast
                        float32x4_t _val = vdupq_n_f32(bfloat16_to_float32(sptr[space_ofs[k]]));
                        // widen 4 bf16 weights to fp32
                        float32x4_t _w = bfloat2float(vld1_u16(kptr));
                        _sum = vmlaq_f32(_sum, _val, _w);

                        kptr += 4;
                    }
                }

                _sum = activation_ps(_sum, activation_type, activation_params);

                // narrow back to bf16 for storage
                vst1_u16(outptr + j * 4, float2bfloat(_sum));
            }

            outptr += outw * 4;
        }
    }
}
src/layer/arm/convolution_pack1to4_fp16s.h
已删除
100644 → 0
浏览文件 @
8049623d
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
// fp16-storage convolution, elempack 1 input -> elempack 4 output.
// Storage is __fp16 but all arithmetic is fp32 (fp16s = fp16 storage only);
// see convolution_pack1to4_fp16sa_neon for the fp16-arithmetic variant.
static void convolution_pack1to4_fp16s_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
{
    int w = bottom_blob.w;
    int channels = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets
    // space_ofs[k]: element offset of kernel tap k from the window origin,
    // dilation included; gap advances to the next kernel row.
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        int gap = w * dilation_h - kernel_w * dilation_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2 += dilation_w;
            }
            p2 += gap;
        }
    }

    // bias stays fp32
    const float* bias_data_ptr = bias_data;

    // num_output
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        __fp16* outptr = top_blob.channel(p);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                // fp32 accumulator for 4 output channels
                float32x4_t _sum = vdupq_n_f32(0.f);

                if (bias_data_ptr)
                {
                    _sum = vld1q_f32(bias_data_ptr + p * 4);
                }

                const __fp16* kptr = weight_data_fp16.channel(p);

                // channels
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob.channel(q);
                    const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w;

                    for (int k = 0; k < maxk; k++)
                    {
                        // broadcast the fp16 input scalar, then widen to fp32
                        float32x4_t _val = vcvt_f32_f16(vdup_n_f16(sptr[space_ofs[k]]));
                        // widen 4 fp16 weights to fp32
                        float32x4_t _w = vcvt_f32_f16(vld1_f16(kptr));
                        _sum = vfmaq_f32(_sum, _val, _w);

                        kptr += 4;
                    }
                }

                _sum = activation_ps(_sum, activation_type, activation_params);

                // narrow back to fp16 for storage
                vst1_f16(outptr + j * 4, vcvt_f16_f32(_sum));
            }

            outptr += outw * 4;
        }
    }
}
// fp16 storage + fp16 arithmetic convolution, elempack 1 input -> elempack 4
// output. Accumulation is done directly in fp16 (float16x4_t) via vfma_f16;
// the bias is expected pre-converted to fp16 (bias_data_fp16).
static void convolution_pack1to4_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data_fp16, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
{
    int w = bottom_blob.w;
    int channels = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets
    // space_ofs[k]: element offset of kernel tap k from the window origin,
    // dilation included; gap advances to the next kernel row.
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        int gap = w * dilation_h - kernel_w * dilation_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2 += dilation_w;
            }
            p2 += gap;
        }
    }

    const __fp16* bias_data_ptr = bias_data_fp16;

    // num_output
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        __fp16* outptr = top_blob.channel(p);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                // fp16 accumulator for 4 output channels
                float16x4_t _sum = vdup_n_f16((__fp16)0.f);

                if (bias_data_ptr)
                {
                    _sum = vld1_f16(bias_data_ptr + p * 4);
                }

                const __fp16* kptr = weight_data_fp16.channel(p);

                // channels
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob.channel(q);
                    const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w;

                    for (int k = 0; k < maxk; k++)
                    {
                        // broadcast the input scalar across 4 fp16 lanes
                        float16x4_t _val = vdup_n_f16(sptr[space_ofs[k]]);
                        float16x4_t _w = vld1_f16(kptr);
                        _sum = vfma_f16(_sum, _val, _w);

                        kptr += 4;
                    }
                }

                _sum = activation_ps(_sum, activation_type, activation_params);

                vst1_f16(outptr + j * 4, _sum);
            }

            outptr += outw * 4;
        }
    }
}
src/layer/arm/convolution_pack1to8_fp16s.h
已删除
100644 → 0
浏览文件 @
8049623d
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
// fp16 storage + fp16 arithmetic convolution, elempack 1 input -> elempack 8
// output. Same scheme as the pack1to4 fp16sa variant but with 8-wide fp16
// vectors (float16x8_t), producing 8 output channels per accumulator.
static void convolution_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data_fp16, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
{
    int w = bottom_blob.w;
    int channels = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets
    // space_ofs[k]: element offset of kernel tap k from the window origin,
    // dilation included; gap advances to the next kernel row.
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        int gap = w * dilation_h - kernel_w * dilation_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2 += dilation_w;
            }
            p2 += gap;
        }
    }

    const __fp16* bias_data_ptr = bias_data_fp16;

    // num_output
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        __fp16* outptr = top_blob.channel(p);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                // fp16 accumulator for 8 output channels
                float16x8_t _sum = vdupq_n_f16((__fp16)0.f);

                if (bias_data_ptr)
                {
                    _sum = vld1q_f16(bias_data_ptr + p * 8);
                }

                const __fp16* kptr = weight_data_fp16.channel(p);

                // channels
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob.channel(q);
                    const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w;

                    for (int k = 0; k < maxk; k++)
                    {
                        // broadcast the input scalar across 8 fp16 lanes
                        float16x8_t _val = vdupq_n_f16(sptr[space_ofs[k]]);
                        float16x8_t _w = vld1q_f16(kptr);
                        _sum = vfmaq_f16(_sum, _val, _w);

                        kptr += 8;
                    }
                }

                _sum = activation_ps(_sum, activation_type, activation_params);

                vst1q_f16(outptr + j * 8, _sum);
            }

            outptr += outw * 8;
        }
    }
}
src/layer/arm/convolution_pack4.h
已删除
100644 → 0
浏览文件 @
8049623d
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
static
void
convolution_transform_kernel_pack4_neon
(
const
Mat
&
weight_data
,
Mat
&
weight_data_pack4
,
int
num_input
,
int
num_output
,
int
kernel_w
,
int
kernel_h
)
{
const
int
maxk
=
kernel_w
*
kernel_h
;
// src = kw-kh-inch-outch
// dst = 4b-4a-kw-kh-inch/4a-outch/4b
Mat
weight_data_r2
=
weight_data
.
reshape
(
maxk
,
num_input
,
num_output
);
weight_data_pack4
.
create
(
maxk
,
num_input
/
4
,
num_output
/
4
,
(
size_t
)
4
*
16
,
16
);
for
(
int
q
=
0
;
q
+
3
<
num_output
;
q
+=
4
)
{
const
Mat
k0
=
weight_data_r2
.
channel
(
q
);
const
Mat
k1
=
weight_data_r2
.
channel
(
q
+
1
);
const
Mat
k2
=
weight_data_r2
.
channel
(
q
+
2
);
const
Mat
k3
=
weight_data_r2
.
channel
(
q
+
3
);
float
*
g00
=
weight_data_pack4
.
channel
(
q
/
4
);
for
(
int
p
=
0
;
p
+
3
<
num_input
;
p
+=
4
)
{
const
float
*
k00
=
k0
.
row
(
p
);
const
float
*
k01
=
k0
.
row
(
p
+
1
);
const
float
*
k02
=
k0
.
row
(
p
+
2
);
const
float
*
k03
=
k0
.
row
(
p
+
3
);
const
float
*
k10
=
k1
.
row
(
p
);
const
float
*
k11
=
k1
.
row
(
p
+
1
);
const
float
*
k12
=
k1
.
row
(
p
+
2
);
const
float
*
k13
=
k1
.
row
(
p
+
3
);
const
float
*
k20
=
k2
.
row
(
p
);
const
float
*
k21
=
k2
.
row
(
p
+
1
);
const
float
*
k22
=
k2
.
row
(
p
+
2
);
const
float
*
k23
=
k2
.
row
(
p
+
3
);
const
float
*
k30
=
k3
.
row
(
p
);
const
float
*
k31
=
k3
.
row
(
p
+
1
);
const
float
*
k32
=
k3
.
row
(
p
+
2
);
const
float
*
k33
=
k3
.
row
(
p
+
3
);
for
(
int
k
=
0
;
k
<
maxk
;
k
++
)
{
g00
[
0
]
=
k00
[
k
];
g00
[
1
]
=
k10
[
k
];
g00
[
2
]
=
k20
[
k
];
g00
[
3
]
=
k30
[
k
];
g00
[
4
]
=
k01
[
k
];
g00
[
5
]
=
k11
[
k
];
g00
[
6
]
=
k21
[
k
];
g00
[
7
]
=
k31
[
k
];
g00
[
8
]
=
k02
[
k
];
g00
[
9
]
=
k12
[
k
];
g00
[
10
]
=
k22
[
k
];
g00
[
11
]
=
k32
[
k
];
g00
[
12
]
=
k03
[
k
];
g00
[
13
]
=
k13
[
k
];
g00
[
14
]
=
k23
[
k
];
g00
[
15
]
=
k33
[
k
];
g00
+=
16
;
}
}
}
}
// fp32 convolution, elempack 4 input -> elempack 4 output.
// Per tap: load 4 packed input channels and 4 weight vectors (one per input
// lane), then fuse with lane-broadcast multiply-accumulates. On aarch64 the
// laneq form is used directly; on armv7 the value is split into low/high
// halves for vmlaq_lane_f32.
static void convolution_pack4_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_pack4, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
{
    int w = bottom_blob.w;
    int channels = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets
    // space_ofs[k]: element offset of kernel tap k from the window origin,
    // dilation included; gap advances to the next kernel row.
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        int gap = w * dilation_h - kernel_w * dilation_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2 += dilation_w;
            }
            p2 += gap;
        }
    }

    const float* bias_data_ptr = bias_data;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        float* outptr = top_blob.channel(p);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                // accumulator holds 4 output channels for this spatial position
                float32x4_t _sum = vdupq_n_f32(0.f);

                if (bias_data_ptr)
                {
                    _sum = vld1q_f32(bias_data_ptr + p * 4);
                }

                // weights for packed output channel p: 16 floats (4x4 tile) per tap
                const float* kptr = (const float*)weight_data_pack4 + maxk * channels * p * 16;

                // channels
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob.channel(q);
                    const float* sptr = m.row(i * stride_h) + j * stride_w * 4;

                    for (int k = 0; k < maxk; k++) // 29.23
                    {
                        // 4 packed input channels at this tap
                        float32x4_t _val = vld1q_f32(sptr + space_ofs[k] * 4);

                        // one weight vector per input lane
                        float32x4_t _w0 = vld1q_f32(kptr);
                        float32x4_t _w1 = vld1q_f32(kptr + 4);
                        float32x4_t _w2 = vld1q_f32(kptr + 8);
                        float32x4_t _w3 = vld1q_f32(kptr + 12);

#if __aarch64__
                        _sum = vmlaq_laneq_f32(_sum, _w0, _val, 0);
                        _sum = vmlaq_laneq_f32(_sum, _w1, _val, 1);
                        _sum = vmlaq_laneq_f32(_sum, _w2, _val, 2);
                        _sum = vmlaq_laneq_f32(_sum, _w3, _val, 3);
#else
                        _sum = vmlaq_lane_f32(_sum, _w0, vget_low_f32(_val), 0);
                        _sum = vmlaq_lane_f32(_sum, _w1, vget_low_f32(_val), 1);
                        _sum = vmlaq_lane_f32(_sum, _w2, vget_high_f32(_val), 0);
                        _sum = vmlaq_lane_f32(_sum, _w3, vget_high_f32(_val), 1);
#endif

                        kptr += 16;
                    }
                }

                _sum = activation_ps(_sum, activation_type, activation_params);

                vst1q_f32(outptr + j * 4, _sum);
            }

            outptr += outw * 4;
        }
    }
}
src/layer/arm/convolution_pack4_bf16s.h
已删除
100644 → 0
浏览文件 @
8049623d
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
static
void
convolution_transform_kernel_pack4_bf16s_neon
(
const
Mat
&
weight_data
,
Mat
&
weight_data_bf16
,
int
num_input
,
int
num_output
,
int
kernel_w
,
int
kernel_h
)
{
const
int
maxk
=
kernel_w
*
kernel_h
;
// src = kw-kh-inch-outch
// dst = 4b-4a-kw-kh-inch/4a-outch/4b
Mat
weight_data_r2
=
weight_data
.
reshape
(
maxk
,
num_input
,
num_output
);
weight_data_bf16
.
create
(
maxk
,
num_input
/
4
,
num_output
/
4
,
(
size_t
)
2
*
16
,
16
);
for
(
int
q
=
0
;
q
+
3
<
num_output
;
q
+=
4
)
{
const
Mat
k0
=
weight_data_r2
.
channel
(
q
);
const
Mat
k1
=
weight_data_r2
.
channel
(
q
+
1
);
const
Mat
k2
=
weight_data_r2
.
channel
(
q
+
2
);
const
Mat
k3
=
weight_data_r2
.
channel
(
q
+
3
);
unsigned
short
*
g00
=
weight_data_bf16
.
channel
(
q
/
4
);
for
(
int
p
=
0
;
p
+
3
<
num_input
;
p
+=
4
)
{
const
float
*
k00
=
k0
.
row
(
p
);
const
float
*
k01
=
k0
.
row
(
p
+
1
);
const
float
*
k02
=
k0
.
row
(
p
+
2
);
const
float
*
k03
=
k0
.
row
(
p
+
3
);
const
float
*
k10
=
k1
.
row
(
p
);
const
float
*
k11
=
k1
.
row
(
p
+
1
);
const
float
*
k12
=
k1
.
row
(
p
+
2
);
const
float
*
k13
=
k1
.
row
(
p
+
3
);
const
float
*
k20
=
k2
.
row
(
p
);
const
float
*
k21
=
k2
.
row
(
p
+
1
);
const
float
*
k22
=
k2
.
row
(
p
+
2
);
const
float
*
k23
=
k2
.
row
(
p
+
3
);
const
float
*
k30
=
k3
.
row
(
p
);
const
float
*
k31
=
k3
.
row
(
p
+
1
);
const
float
*
k32
=
k3
.
row
(
p
+
2
);
const
float
*
k33
=
k3
.
row
(
p
+
3
);
for
(
int
k
=
0
;
k
<
maxk
;
k
++
)
{
g00
[
0
]
=
float32_to_bfloat16
(
k00
[
k
]);
g00
[
1
]
=
float32_to_bfloat16
(
k10
[
k
]);
g00
[
2
]
=
float32_to_bfloat16
(
k20
[
k
]);
g00
[
3
]
=
float32_to_bfloat16
(
k30
[
k
]);
g00
[
4
]
=
float32_to_bfloat16
(
k01
[
k
]);
g00
[
5
]
=
float32_to_bfloat16
(
k11
[
k
]);
g00
[
6
]
=
float32_to_bfloat16
(
k21
[
k
]);
g00
[
7
]
=
float32_to_bfloat16
(
k31
[
k
]);
g00
[
8
]
=
float32_to_bfloat16
(
k02
[
k
]);
g00
[
9
]
=
float32_to_bfloat16
(
k12
[
k
]);
g00
[
10
]
=
float32_to_bfloat16
(
k22
[
k
]);
g00
[
11
]
=
float32_to_bfloat16
(
k32
[
k
]);
g00
[
12
]
=
float32_to_bfloat16
(
k03
[
k
]);
g00
[
13
]
=
float32_to_bfloat16
(
k13
[
k
]);
g00
[
14
]
=
float32_to_bfloat16
(
k23
[
k
]);
g00
[
15
]
=
float32_to_bfloat16
(
k33
[
k
]);
g00
+=
16
;
}
}
}
}
// bf16-storage convolution, elempack 4 input -> elempack 4 output.
// Inputs/weights/outputs are bf16 in memory; arithmetic is fp32. Per tap the
// 4 packed input channels and the 4x4 weight tile are widened to fp32 and
// fused with lane-broadcast multiply-accumulates (laneq on aarch64,
// low/high-half lane form on armv7).
static void convolution_pack4_bf16s_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_bf16, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
{
    int w = bottom_blob.w;
    int channels = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets
    // space_ofs[k]: element offset of kernel tap k from the window origin,
    // dilation included; gap advances to the next kernel row.
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        int gap = w * dilation_h - kernel_w * dilation_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2 += dilation_w;
            }
            p2 += gap;
        }
    }

    // bias stays fp32
    const float* bias_data_ptr = bias_data;

    // num_output
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        unsigned short* outptr = top_blob.channel(p);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                // fp32 accumulator for 4 output channels
                float32x4_t _sum = vdupq_n_f32(0.f);

                if (bias_data_ptr)
                {
                    _sum = vld1q_f32(bias_data_ptr + p * 4);
                }

                const unsigned short* kptr = weight_data_bf16.channel(p);

                // channels
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob.channel(q);
                    const unsigned short* sptr = m.row<const unsigned short>(i * stride_h) + j * stride_w * 4;

                    for (int k = 0; k < maxk; k++)
                    {
                        // widen 4 packed bf16 input channels to fp32
                        float32x4_t _val = bfloat2float(vld1_u16(sptr + space_ofs[k] * 4));

                        // widen the 4x4 bf16 weight tile, one vector per input lane
                        float32x4_t _w0 = bfloat2float(vld1_u16(kptr));
                        float32x4_t _w1 = bfloat2float(vld1_u16(kptr + 4));
                        float32x4_t _w2 = bfloat2float(vld1_u16(kptr + 8));
                        float32x4_t _w3 = bfloat2float(vld1_u16(kptr + 12));

#if __aarch64__
                        _sum = vmlaq_laneq_f32(_sum, _w0, _val, 0);
                        _sum = vmlaq_laneq_f32(_sum, _w1, _val, 1);
                        _sum = vmlaq_laneq_f32(_sum, _w2, _val, 2);
                        _sum = vmlaq_laneq_f32(_sum, _w3, _val, 3);
#else
                        _sum = vmlaq_lane_f32(_sum, _w0, vget_low_f32(_val), 0);
                        _sum = vmlaq_lane_f32(_sum, _w1, vget_low_f32(_val), 1);
                        _sum = vmlaq_lane_f32(_sum, _w2, vget_high_f32(_val), 0);
                        _sum = vmlaq_lane_f32(_sum, _w3, vget_high_f32(_val), 1);
#endif

                        kptr += 16;
                    }
                }

                _sum = activation_ps(_sum, activation_type, activation_params);

                // narrow back to bf16 for storage
                vst1_u16(outptr + j * 4, float2bfloat(_sum));
            }

            outptr += outw * 4;
        }
    }
}
src/layer/arm/convolution_pack4_fp16s.h
已删除
100644 → 0
浏览文件 @
8049623d
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
// fp16-storage convolution, elempack 4 input -> elempack 4 output.
// Storage is __fp16, arithmetic is fp32: per tap the 4 packed input channels
// and the 4x4 weight tile are widened with vcvt_f32_f16 and fused via
// vfmaq_laneq_f32 (aarch64-only intrinsic path).
static void convolution_pack4_fp16s_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
{
    int w = bottom_blob.w;
    int channels = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets
    // space_ofs[k]: element offset of kernel tap k from the window origin,
    // dilation included; gap advances to the next kernel row.
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        int gap = w * dilation_h - kernel_w * dilation_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2 += dilation_w;
            }
            p2 += gap;
        }
    }

    // bias stays fp32
    const float* bias_data_ptr = bias_data;

    // num_output
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        __fp16* outptr = top_blob.channel(p);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                // fp32 accumulator for 4 output channels
                float32x4_t _sum = vdupq_n_f32(0.f);

                if (bias_data_ptr)
                {
                    _sum = vld1q_f32(bias_data_ptr + p * 4);
                }

                const __fp16* kptr = weight_data_fp16.channel(p);

                // channels
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob.channel(q);
                    const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w * 4;

                    for (int k = 0; k < maxk; k++)
                    {
                        // widen 4 packed fp16 input channels to fp32
                        float32x4_t _val = vcvt_f32_f16(vld1_f16(sptr + space_ofs[k] * 4));

                        // widen the 4x4 fp16 weight tile, one vector per input lane
                        float32x4_t _w0 = vcvt_f32_f16(vld1_f16(kptr));
                        float32x4_t _w1 = vcvt_f32_f16(vld1_f16(kptr + 4));
                        float32x4_t _w2 = vcvt_f32_f16(vld1_f16(kptr + 8));
                        float32x4_t _w3 = vcvt_f32_f16(vld1_f16(kptr + 12));

                        _sum = vfmaq_laneq_f32(_sum, _w0, _val, 0);
                        _sum = vfmaq_laneq_f32(_sum, _w1, _val, 1);
                        _sum = vfmaq_laneq_f32(_sum, _w2, _val, 2);
                        _sum = vfmaq_laneq_f32(_sum, _w3, _val, 3);

                        kptr += 16;
                    }
                }

                _sum = activation_ps(_sum, activation_type, activation_params);

                // narrow back to fp16 for storage
                vst1_f16(outptr + j * 4, vcvt_f16_f32(_sum));
            }

            outptr += outw * 4;
        }
    }
}
// fp16 storage + fp16 arithmetic convolution, elempack 4 input -> elempack 4
// output. Accumulates directly in fp16 (float16x4_t) using vfma_lane_f16;
// bias is expected pre-converted to fp16 (bias_data_fp16).
static void convolution_pack4_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data_fp16, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
{
    int w = bottom_blob.w;
    int channels = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets
    // space_ofs[k]: element offset of kernel tap k from the window origin,
    // dilation included; gap advances to the next kernel row.
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        int gap = w * dilation_h - kernel_w * dilation_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2 += dilation_w;
            }
            p2 += gap;
        }
    }

    const __fp16* bias_data_ptr = bias_data_fp16;

    // num_output
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        __fp16* outptr = top_blob.channel(p);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                // fp16 accumulator for 4 output channels
                float16x4_t _sum = vdup_n_f16((__fp16)0.f);

                if (bias_data_ptr)
                {
                    _sum = vld1_f16(bias_data_ptr + p * 4);
                }

                const __fp16* kptr = weight_data_fp16.channel(p);

                // channels
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob.channel(q);
                    const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w * 4;

                    for (int k = 0; k < maxk; k++)
                    {
                        // 4 packed fp16 input channels at this tap
                        float16x4_t _val = vld1_f16(sptr + space_ofs[k] * 4);

                        // 4x4 fp16 weight tile, one vector per input lane
                        float16x4_t _w0 = vld1_f16(kptr);
                        float16x4_t _w1 = vld1_f16(kptr + 4);
                        float16x4_t _w2 = vld1_f16(kptr + 8);
                        float16x4_t _w3 = vld1_f16(kptr + 12);

                        _sum = vfma_lane_f16(_sum, _w0, _val, 0);
                        _sum = vfma_lane_f16(_sum, _w1, _val, 1);
                        _sum = vfma_lane_f16(_sum, _w2, _val, 2);
                        _sum = vfma_lane_f16(_sum, _w3, _val, 3);

                        kptr += 16;
                    }
                }

                _sum = activation_ps(_sum, activation_type, activation_params);

                vst1_f16(outptr + j * 4, _sum);
            }

            outptr += outw * 4;
        }
    }
}
src/layer/arm/convolution_pack4to1.h
已删除
100644 → 0
浏览文件 @
8049623d
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
static
void
convolution_transform_kernel_pack4to1_neon
(
const
Mat
&
weight_data
,
Mat
&
weight_data_pack4to1
,
int
num_input
,
int
num_output
,
int
kernel_w
,
int
kernel_h
)
{
const
int
maxk
=
kernel_w
*
kernel_h
;
// src = kw-kh-inch-outch
// dst = 4a-kw-kh-inch/4a-outch
Mat
weight_data_r2
=
weight_data
.
reshape
(
maxk
,
num_input
,
num_output
);
weight_data_pack4to1
.
create
(
maxk
,
num_input
/
4
,
num_output
,
(
size_t
)
4
*
4
,
4
);
for
(
int
q
=
0
;
q
<
num_output
;
q
++
)
{
const
Mat
k0
=
weight_data_r2
.
channel
(
q
);
float
*
g00
=
weight_data_pack4to1
.
channel
(
q
);
for
(
int
p
=
0
;
p
+
3
<
num_input
;
p
+=
4
)
{
const
float
*
k00
=
k0
.
row
(
p
);
const
float
*
k01
=
k0
.
row
(
p
+
1
);
const
float
*
k02
=
k0
.
row
(
p
+
2
);
const
float
*
k03
=
k0
.
row
(
p
+
3
);
for
(
int
k
=
0
;
k
<
maxk
;
k
++
)
{
g00
[
0
]
=
k00
[
k
];
g00
[
1
]
=
k01
[
k
];
g00
[
2
]
=
k02
[
k
];
g00
[
3
]
=
k03
[
k
];
g00
+=
4
;
}
}
}
}
// Naive (im2col-free) fp32 convolution from an elempack=4 input blob to an
// elempack=1 output blob. For each output element it accumulates a dot
// product over all input channels and kernel taps; the inner 4-wide vector
// handles the 4 interleaved input channels of one packed element.
// Weights must be pre-arranged by convolution_transform_kernel_pack4to1_neon.
static void convolution_pack4to1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_pack4to1, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
{
    int w = bottom_blob.w;
    int channels = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets
    // space_ofs[k] is the element offset (in packed elements, not floats) of
    // kernel tap k relative to the window origin, with dilation applied.
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        int gap = w * dilation_h - kernel_w * dilation_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2 += dilation_w;
            }
            p2 += gap; // jump to the start of the next kernel row
        }
    }

    const float* bias_data_ptr = bias_data;

    // num_output — one thread per output channel
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        float* outptr = top_blob.channel(p);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                float sum = 0.f;
                if (bias_data_ptr)
                {
                    sum = bias_data_ptr[p];
                }

                // weights are stored flat: 4 floats per tap, maxk taps per
                // input-channel group, channels groups per output channel
                const float* kptr = (const float*)weight_data_pack4to1 + maxk * channels * p * 4;

                // channels — accumulate over every packed input channel
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob.channel(q);
                    // 4 floats per packed element -> stride_w scaled by 4
                    const float* sptr = m.row(i * stride_h) + j * stride_w * 4;

                    for (int k = 0; k < maxk; k++) // 29.23
                    {
                        float32x4_t _val = vld1q_f32(sptr + space_ofs[k] * 4);
                        float32x4_t _w = vld1q_f32(kptr);
                        float32x4_t _s4 = vmulq_f32(_val, _w);
#if __aarch64__
                        sum += vaddvq_f32(_s4); // dot
#else
                        // armv7 has no across-vector add: pairwise reduce instead
                        float32x2_t _ss = vadd_f32(vget_low_f32(_s4), vget_high_f32(_s4));
                        _ss = vpadd_f32(_ss, _ss);
                        sum += vget_lane_f32(_ss, 0);
#endif
                        kptr += 4;
                    }
                }

                // apply the fused activation (relu/clip/... selected by type)
                sum = activation_ss(sum, activation_type, activation_params);

                outptr[j] = sum;
            }

            outptr += outw;
        }
    }
}
src/layer/arm/convolution_pack4to1_bf16s.h
已删除
100644 → 0
浏览文件 @
8049623d
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
static
void
convolution_transform_kernel_pack4to1_bf16s_neon
(
const
Mat
&
weight_data
,
Mat
&
weight_data_bf16
,
int
num_input
,
int
num_output
,
int
kernel_w
,
int
kernel_h
)
{
const
int
maxk
=
kernel_w
*
kernel_h
;
// src = kw-kh-inch-outch
// dst = 4a-kw-kh-inch/4a-outch
Mat
weight_data_r2
=
weight_data
.
reshape
(
maxk
,
num_input
,
num_output
);
weight_data_bf16
.
create
(
maxk
,
num_input
/
4
,
num_output
,
(
size_t
)
2
*
4
,
4
);
for
(
int
q
=
0
;
q
<
num_output
;
q
++
)
{
const
Mat
k0
=
weight_data_r2
.
channel
(
q
);
unsigned
short
*
g00
=
weight_data_bf16
.
channel
(
q
);
for
(
int
p
=
0
;
p
+
3
<
num_input
;
p
+=
4
)
{
const
float
*
k00
=
k0
.
row
(
p
);
const
float
*
k01
=
k0
.
row
(
p
+
1
);
const
float
*
k02
=
k0
.
row
(
p
+
2
);
const
float
*
k03
=
k0
.
row
(
p
+
3
);
for
(
int
k
=
0
;
k
<
maxk
;
k
++
)
{
g00
[
0
]
=
float32_to_bfloat16
(
k00
[
k
]);
g00
[
1
]
=
float32_to_bfloat16
(
k01
[
k
]);
g00
[
2
]
=
float32_to_bfloat16
(
k02
[
k
]);
g00
[
3
]
=
float32_to_bfloat16
(
k03
[
k
]);
g00
+=
4
;
}
}
}
}
// Naive bf16-storage convolution from an elempack=4 input blob to an
// elempack=1 output blob. Activations and weights are stored as bfloat16
// (unsigned short) but widened to fp32 for the arithmetic; the result is
// narrowed back to bf16 on store.
static void convolution_pack4to1_bf16s_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_bf16, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
{
    int w = bottom_blob.w;
    int channels = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets
    // space_ofs[k] = element offset of kernel tap k from the window origin,
    // dilation included.
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        int gap = w * dilation_h - kernel_w * dilation_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2 += dilation_w;
            }
            p2 += gap; // jump to the start of the next kernel row
        }
    }

    // bias stays fp32; it seeds the accumulator before the dot product
    const float* bias_data_ptr = bias_data;

    // num_output — one thread per output channel
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        unsigned short* outptr = top_blob.channel(p);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                float sum = 0.f;
                if (bias_data_ptr)
                {
                    sum = bias_data_ptr[p];
                }

                const unsigned short* kptr = weight_data_bf16.channel(p);

                // channels — accumulate over every packed input channel
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob.channel(q);
                    // 4 bf16 values per packed element
                    const unsigned short* sptr = m.row<const unsigned short>(i * stride_h) + j * stride_w * 4;

                    for (int k = 0; k < maxk; k++)
                    {
                        // widen bf16 -> fp32, multiply, then reduce to scalar
                        float32x4_t _val = bfloat2float(vld1_u16(sptr + space_ofs[k] * 4));
                        float32x4_t _w = bfloat2float(vld1_u16(kptr));
                        float32x4_t _s4 = vmulq_f32(_val, _w);
#if __aarch64__
                        sum += vaddvq_f32(_s4); // dot
#else
                        // armv7 has no across-vector add: pairwise reduce instead
                        float32x2_t _ss = vadd_f32(vget_low_f32(_s4), vget_high_f32(_s4));
                        _ss = vpadd_f32(_ss, _ss);
                        sum += vget_lane_f32(_ss, 0);
#endif
                        kptr += 4;
                    }
                }

                // fused activation in fp32, then narrow the result to bf16
                sum = activation_ss(sum, activation_type, activation_params);

                outptr[j] = float32_to_bfloat16(sum);
            }

            outptr += outw;
        }
    }
}
src/layer/arm/convolution_pack4to1_fp16s.h
已删除
100644 → 0
浏览文件 @
8049623d
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
// Naive fp16-storage convolution from an elempack=4 input blob to an
// elempack=1 output blob. Values are stored as __fp16 but widened to fp32
// for the multiply-accumulate (the "fp16s" variant: half storage, single
// precision arithmetic). aarch64-only: uses vaddvq_f32 unconditionally.
static void convolution_pack4to1_fp16s_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
{
    int w = bottom_blob.w;
    int channels = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets
    // space_ofs[k] = element offset of kernel tap k from the window origin,
    // dilation included.
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        int gap = w * dilation_h - kernel_w * dilation_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2 += dilation_w;
            }
            p2 += gap; // jump to the start of the next kernel row
        }
    }

    // bias stays fp32; it seeds the accumulator before the dot product
    const float* bias_data_ptr = bias_data;

    // num_output — one thread per output channel
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        __fp16* outptr = top_blob.channel(p);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                float sum = 0.f;
                if (bias_data_ptr)
                {
                    sum = bias_data_ptr[p];
                }

                const __fp16* kptr = weight_data_fp16.channel(p);

                // channels — accumulate over every packed input channel
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob.channel(q);
                    // 4 __fp16 values per packed element
                    const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w * 4;

                    for (int k = 0; k < maxk; k++)
                    {
                        // widen fp16 -> fp32, multiply, reduce across lanes
                        float32x4_t _val = vcvt_f32_f16(vld1_f16(sptr + space_ofs[k] * 4));
                        float32x4_t _w = vcvt_f32_f16(vld1_f16(kptr));
                        float32x4_t _s4 = vmulq_f32(_val, _w);
                        sum += vaddvq_f32(_s4); // dot
                        kptr += 4;
                    }
                }

                // fused activation in fp32, then narrow the result to fp16
                sum = activation_ss(sum, activation_type, activation_params);

                outptr[j] = (__fp16)sum;
            }

            outptr += outw;
        }
    }
}
// Naive fp16-arithmetic convolution from an elempack=4 input blob to an
// elempack=1 output blob (the "fp16sa" variant: half storage AND half
// multiplies). The 4-lane fp16 product is widened to fp32 only for the
// horizontal reduction and accumulation, limiting precision loss in the sum.
static void convolution_pack4to1_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
{
    int w = bottom_blob.w;
    int channels = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets
    // space_ofs[k] = element offset of kernel tap k from the window origin,
    // dilation included.
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        int gap = w * dilation_h - kernel_w * dilation_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2 += dilation_w;
            }
            p2 += gap; // jump to the start of the next kernel row
        }
    }

    // bias stays fp32; it seeds the accumulator before the dot product
    const float* bias_data_ptr = bias_data;

    // num_output — one thread per output channel
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        __fp16* outptr = top_blob.channel(p);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                float sum = 0.f;
                if (bias_data_ptr)
                {
                    sum = bias_data_ptr[p];
                }

                const __fp16* kptr = weight_data_fp16.channel(p);

                // channels — accumulate over every packed input channel
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob.channel(q);
                    // 4 __fp16 values per packed element
                    const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w * 4;

                    for (int k = 0; k < maxk; k++)
                    {
                        // multiply in fp16, then widen for the reduction
                        float16x4_t _val = vld1_f16(sptr + space_ofs[k] * 4);
                        float16x4_t _w = vld1_f16(kptr);
                        float16x4_t _s4 = vmul_f16(_val, _w);
                        sum += vaddvq_f32(vcvt_f32_f16(_s4)); // dot
                        kptr += 4;
                    }
                }

                // fused activation in fp32; implicit narrowing to fp16 on store
                sum = activation_ss(sum, activation_type, activation_params);

                outptr[j] = sum;
            }

            outptr += outw;
        }
    }
}
src/layer/arm/convolution_pack4to8_fp16s.h
已删除
100644 → 0
浏览文件 @
8049623d
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
// Naive fp16-arithmetic convolution from an elempack=4 input blob to an
// elempack=8 output blob. For each output element an 8-lane fp16
// accumulator holds 8 output channels; each of the 4 packed input channels
// contributes via a lane-broadcast fused multiply-add.
static void convolution_pack4to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data_fp16, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
{
    int w = bottom_blob.w;
    int channels = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets
    // space_ofs[k] = element offset of kernel tap k from the window origin,
    // dilation included.
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        int gap = w * dilation_h - kernel_w * dilation_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2 += dilation_w;
            }
            p2 += gap; // jump to the start of the next kernel row
        }
    }

    // bias is pre-converted to fp16; loaded directly into the accumulator
    const __fp16* bias_data_ptr = bias_data_fp16;

    // num_output — one thread per (packed) output channel
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        __fp16* outptr = top_blob.channel(p);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                float16x8_t _sum = vdupq_n_f16((__fp16)0.f);
                if (bias_data_ptr)
                {
                    _sum = vld1q_f16(bias_data_ptr + p * 8);
                }

                const __fp16* kptr = weight_data_fp16.channel(p);

                // channels — accumulate over every packed input channel
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob.channel(q);
                    // 4 __fp16 values per packed input element
                    const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w * 4;

                    for (int k = 0; k < maxk; k++)
                    {
                        // _wN holds the 8 output-channel weights for input lane N
                        float16x4_t _val = vld1_f16(sptr + space_ofs[k] * 4);
                        float16x8_t _w0 = vld1q_f16(kptr);
                        float16x8_t _w1 = vld1q_f16(kptr + 8);
                        float16x8_t _w2 = vld1q_f16(kptr + 16);
                        float16x8_t _w3 = vld1q_f16(kptr + 24);

                        _sum = vfmaq_lane_f16(_sum, _w0, _val, 0);
                        _sum = vfmaq_lane_f16(_sum, _w1, _val, 1);
                        _sum = vfmaq_lane_f16(_sum, _w2, _val, 2);
                        _sum = vfmaq_lane_f16(_sum, _w3, _val, 3);

                        kptr += 32; // 4 input lanes x 8 output channels per tap
                    }
                }

                // fused activation applied on the whole 8-lane vector
                _sum = activation_ps(_sum, activation_type, activation_params);

                vst1q_f16(outptr + j * 8, _sum);
            }

            outptr += outw * 8;
        }
    }
}
src/layer/arm/convolution_pack8_fp16s.h
已删除
100644 → 0
浏览文件 @
8049623d
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
// Naive fp16-arithmetic convolution with elempack=8 on both input and
// output. An 8-lane fp16 accumulator holds 8 output channels; each of the
// 8 packed input channels contributes via a lane-broadcast fused
// multiply-add (8x8 weight tile per kernel tap).
static void convolution_pack8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data_fp16, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
{
    int w = bottom_blob.w;
    int channels = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets
    // space_ofs[k] = element offset of kernel tap k from the window origin,
    // dilation included.
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        int gap = w * dilation_h - kernel_w * dilation_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2 += dilation_w;
            }
            p2 += gap; // jump to the start of the next kernel row
        }
    }

    // bias is pre-converted to fp16; loaded directly into the accumulator
    const __fp16* bias_data_ptr = bias_data_fp16;

    // num_output — one thread per (packed) output channel
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        __fp16* outptr = top_blob.channel(p);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                float16x8_t _sum = vdupq_n_f16((__fp16)0.f);
                if (bias_data_ptr)
                {
                    _sum = vld1q_f16(bias_data_ptr + p * 8);
                }

                const __fp16* kptr = weight_data_fp16.channel(p);

                // channels — accumulate over every packed input channel
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob.channel(q);
                    // 8 __fp16 values per packed input element
                    const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w * 8;

                    for (int k = 0; k < maxk; k++)
                    {
                        // _wN holds the 8 output-channel weights for input lane N
                        float16x8_t _val = vld1q_f16(sptr + space_ofs[k] * 8);
                        float16x8_t _w0 = vld1q_f16(kptr);
                        float16x8_t _w1 = vld1q_f16(kptr + 8);
                        float16x8_t _w2 = vld1q_f16(kptr + 16);
                        float16x8_t _w3 = vld1q_f16(kptr + 24);
                        float16x8_t _w4 = vld1q_f16(kptr + 32);
                        float16x8_t _w5 = vld1q_f16(kptr + 40);
                        float16x8_t _w6 = vld1q_f16(kptr + 48);
                        float16x8_t _w7 = vld1q_f16(kptr + 56);

                        _sum = vfmaq_laneq_f16(_sum, _w0, _val, 0);
                        _sum = vfmaq_laneq_f16(_sum, _w1, _val, 1);
                        _sum = vfmaq_laneq_f16(_sum, _w2, _val, 2);
                        _sum = vfmaq_laneq_f16(_sum, _w3, _val, 3);
                        _sum = vfmaq_laneq_f16(_sum, _w4, _val, 4);
                        _sum = vfmaq_laneq_f16(_sum, _w5, _val, 5);
                        _sum = vfmaq_laneq_f16(_sum, _w6, _val, 6);
                        _sum = vfmaq_laneq_f16(_sum, _w7, _val, 7);

                        kptr += 64; // 8 input lanes x 8 output channels per tap
                    }
                }

                // fused activation applied on the whole 8-lane vector
                _sum = activation_ps(_sum, activation_type, activation_params);

                vst1q_f16(outptr + j * 8, _sum);
            }

            outptr += outw * 8;
        }
    }
}
src/layer/arm/convolution_pack8to1_fp16s.h
已删除
100644 → 0
浏览文件 @
8049623d
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
// Naive fp16-arithmetic convolution from an elempack=8 input blob to an
// elempack=1 output blob. The 8-lane fp16 product is folded to 4 lanes in
// fp16, then widened to fp32 for the final horizontal sum into a scalar
// accumulator.
static void convolution_pack8to1_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
{
    int w = bottom_blob.w;
    int channels = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets
    // space_ofs[k] = element offset of kernel tap k from the window origin,
    // dilation included.
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        int gap = w * dilation_h - kernel_w * dilation_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2 += dilation_w;
            }
            p2 += gap; // jump to the start of the next kernel row
        }
    }

    // bias stays fp32; it seeds the accumulator before the dot product
    const float* bias_data_ptr = bias_data;

    // num_output — one thread per output channel
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        __fp16* outptr = top_blob.channel(p);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                float sum = 0.f;
                if (bias_data_ptr)
                {
                    sum = bias_data_ptr[p];
                }

                const __fp16* kptr = weight_data_fp16.channel(p);

                // channels — accumulate over every packed input channel
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob.channel(q);
                    // 8 __fp16 values per packed input element
                    const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w * 8;

                    for (int k = 0; k < maxk; k++)
                    {
                        // fp16 multiply, halve the vector in fp16, then
                        // finish the reduction in fp32
                        float16x8_t _val = vld1q_f16(sptr + space_ofs[k] * 8);
                        float16x8_t _w = vld1q_f16(kptr);
                        float16x8_t _s8 = vmulq_f16(_val, _w);
                        float16x4_t _s4 = vadd_f16(vget_low_f16(_s8), vget_high_f16(_s8));
                        sum += vaddvq_f32(vcvt_f32_f16(_s4)); // dot
                        kptr += 8;
                    }
                }

                // fused activation in fp32; implicit narrowing to fp16 on store
                sum = activation_ss(sum, activation_type, activation_params);

                outptr[j] = sum;
            }

            outptr += outw;
        }
    }
}
src/layer/arm/convolution_pack8to4_fp16s.h
已删除
100644 → 0
浏览文件 @
8049623d
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
// Naive fp16-arithmetic convolution from an elempack=8 input blob to an
// elempack=4 output blob. A 4-lane fp16 accumulator holds 4 output
// channels; each of the 8 packed input channels contributes via a
// lane-broadcast fused multiply-add (8x4 weight tile per kernel tap).
static void convolution_pack8to4_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data_fp16, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
{
    int w = bottom_blob.w;
    int channels = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets
    // space_ofs[k] = element offset of kernel tap k from the window origin,
    // dilation included.
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        int gap = w * dilation_h - kernel_w * dilation_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2 += dilation_w;
            }
            p2 += gap; // jump to the start of the next kernel row
        }
    }

    // bias is pre-converted to fp16; loaded directly into the accumulator
    const __fp16* bias_data_ptr = bias_data_fp16;

    // num_output — one thread per (packed) output channel
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        __fp16* outptr = top_blob.channel(p);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                float16x4_t _sum = vdup_n_f16((__fp16)0.f);
                if (bias_data_ptr)
                {
                    _sum = vld1_f16(bias_data_ptr + p * 4);
                }

                const __fp16* kptr = weight_data_fp16.channel(p);

                // channels — accumulate over every packed input channel
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob.channel(q);
                    // 8 __fp16 values per packed input element
                    const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w * 8;

                    for (int k = 0; k < maxk; k++)
                    {
                        // _wN holds the 4 output-channel weights for input lane N
                        float16x8_t _val = vld1q_f16(sptr + space_ofs[k] * 8);
                        float16x4_t _w0 = vld1_f16(kptr);
                        float16x4_t _w1 = vld1_f16(kptr + 4);
                        float16x4_t _w2 = vld1_f16(kptr + 8);
                        float16x4_t _w3 = vld1_f16(kptr + 12);
                        float16x4_t _w4 = vld1_f16(kptr + 16);
                        float16x4_t _w5 = vld1_f16(kptr + 20);
                        float16x4_t _w6 = vld1_f16(kptr + 24);
                        float16x4_t _w7 = vld1_f16(kptr + 28);

                        _sum = vfma_laneq_f16(_sum, _w0, _val, 0);
                        _sum = vfma_laneq_f16(_sum, _w1, _val, 1);
                        _sum = vfma_laneq_f16(_sum, _w2, _val, 2);
                        _sum = vfma_laneq_f16(_sum, _w3, _val, 3);
                        _sum = vfma_laneq_f16(_sum, _w4, _val, 4);
                        _sum = vfma_laneq_f16(_sum, _w5, _val, 5);
                        _sum = vfma_laneq_f16(_sum, _w6, _val, 6);
                        _sum = vfma_laneq_f16(_sum, _w7, _val, 7);

                        kptr += 32; // 8 input lanes x 4 output channels per tap
                    }
                }

                // fused activation applied on the whole 4-lane vector
                _sum = activation_ps(_sum, activation_type, activation_params);

                vst1_f16(outptr + j * 4, _sum);
            }

            outptr += outw * 4;
        }
    }
}
src/layer/arm/convolution_packed.h
0 → 100644
浏览文件 @
5ac17df7
此差异已折叠。
点击以展开。
src/layer/arm/convolution_packed_bf16s.h
0 → 100644
浏览文件 @
5ac17df7
此差异已折叠。
点击以展开。
src/layer/arm/convolution_packed_fp16s.h
0 → 100644
浏览文件 @
5ac17df7
此差异已折叠。
点击以展开。
src/layer/x86/convolution_packed.h
浏览文件 @
5ac17df7
...
...
@@ -71,7 +71,7 @@ static void convolution_transform_kernel_packed(const Mat& kernel, Mat& kernel_t
#endif // __AVX__
if
(
inch
>=
4
)
kernel_tm
.
create
(
4
*
4
*
maxk
,
inch
/
4
+
(
inch
%
4
)
/
2
+
inch
%
2
,
outch
/
4
+
(
outch
%
4
)
/
2
+
outch
%
2
);
if
(
inch
>=
2
)
else
if
(
inch
>=
2
)
kernel_tm
.
create
(
4
*
2
*
maxk
,
inch
/
2
+
inch
%
2
,
outch
/
4
+
(
outch
%
4
)
/
2
+
outch
%
2
);
else
kernel_tm
.
create
(
4
*
maxk
,
inch
,
outch
/
4
+
(
outch
%
4
)
/
2
+
outch
%
2
);
...
...
@@ -93,7 +93,7 @@ static void convolution_transform_kernel_packed(const Mat& kernel, Mat& kernel_t
#endif // __AVX__
if
(
inch
>=
4
)
kernel_tm
.
create
(
2
*
4
*
maxk
,
inch
/
4
+
(
inch
%
4
)
/
2
+
inch
%
2
,
outch
/
2
+
outch
%
2
);
if
(
inch
>=
2
)
else
if
(
inch
>=
2
)
kernel_tm
.
create
(
2
*
2
*
maxk
,
inch
/
2
+
inch
%
2
,
outch
/
2
+
outch
%
2
);
else
#endif // __SSE2__
...
...
@@ -114,7 +114,7 @@ static void convolution_transform_kernel_packed(const Mat& kernel, Mat& kernel_t
#endif // __AVX__
if
(
inch
>=
4
)
kernel_tm
.
create
(
4
*
maxk
,
inch
/
4
+
(
inch
%
4
)
/
2
+
inch
%
2
,
outch
);
if
(
inch
>=
2
)
else
if
(
inch
>=
2
)
kernel_tm
.
create
(
2
*
maxk
,
inch
/
2
+
inch
%
2
,
outch
);
else
#endif // __SSE2__
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录