Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
wjd2002
Ncnn
提交
7883f4d0
N
Ncnn
项目概览
wjd2002
/
Ncnn
9 个月 前同步成功
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
N
Ncnn
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
前往新版Gitcode,体验更适合开发者的 AI 搜索 >>
未验证
提交
7883f4d0
编写于
5月 23, 2023
作者:
N
nihui
提交者:
GitHub
5月 23, 2023
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
shadowed variable for less openmp task args (#4744)
上级
1d6bfdca
变更
14
隐藏空白更改
内联
并排
Showing
14 changed file
with
251 addition
and
3 deletion
+251
-3
src/layer/arm/convolution1d_packed.h
src/layer/arm/convolution1d_packed.h
+17
-0
src/layer/arm/convolution1d_packed_bf16s.h
src/layer/arm/convolution1d_packed_bf16s.h
+17
-0
src/layer/arm/convolution1d_packed_fp16s.h
src/layer/arm/convolution1d_packed_fp16s.h
+34
-0
src/layer/arm/convolution_packed.h
src/layer/arm/convolution_packed.h
+20
-0
src/layer/arm/convolution_packed_bf16s.h
src/layer/arm/convolution_packed_bf16s.h
+20
-0
src/layer/arm/convolution_packed_fp16s.h
src/layer/arm/convolution_packed_fp16s.h
+40
-0
src/layer/arm/gemm_arm.cpp
src/layer/arm/gemm_arm.cpp
+16
-0
src/layer/arm/gemm_arm_asimdhp.cpp
src/layer/arm/gemm_arm_asimdhp.cpp
+8
-0
src/layer/arm/gemm_arm_vfpv4.cpp
src/layer/arm/gemm_arm_vfpv4.cpp
+8
-0
src/layer/arm/gemm_fp16s.h
src/layer/arm/gemm_fp16s.h
+10
-0
src/layer/gemm.cpp
src/layer/gemm.cpp
+3
-3
src/layer/x86/convolution1d_packed.h
src/layer/x86/convolution1d_packed.h
+23
-0
src/layer/x86/convolution_packed.h
src/layer/x86/convolution_packed.h
+27
-0
src/layer/x86/gemm_x86.cpp
src/layer/x86/gemm_x86.cpp
+8
-0
未找到文件。
src/layer/arm/convolution1d_packed.h
浏览文件 @
7883f4d0
...
...
@@ -525,6 +525,12 @@ static void convolution1d_packed(const Mat& bottom_blob, Mat& top_blob, const Ma
{
const
int
p
=
remain_outh_start
+
pp
*
8
;
// shadowed variable for less openmp task args
const
int
elempack
=
bottom_blob
.
elempack
;
const
int
inh
=
bottom_blob
.
h
*
elempack
;
const
int
outw
=
top_blob
.
w
;
const
int
out_elempack
=
top_blob
.
elempack
;
float
*
outptr
=
top_blob
.
row
(
p
/
out_elempack
);
for
(
int
j
=
0
;
j
<
outw
;
j
++
)
...
...
@@ -743,6 +749,12 @@ static void convolution1d_packed(const Mat& bottom_blob, Mat& top_blob, const Ma
{
const
int
p
=
remain_outh_start
+
pp
*
4
;
// shadowed variable for less openmp task args
const
int
elempack
=
bottom_blob
.
elempack
;
const
int
inh
=
bottom_blob
.
h
*
elempack
;
const
int
outw
=
top_blob
.
w
;
const
int
out_elempack
=
top_blob
.
elempack
;
float
*
outptr
=
top_blob
.
row
(
p
/
out_elempack
);
for
(
int
j
=
0
;
j
<
outw
;
j
++
)
...
...
@@ -939,6 +951,11 @@ static void convolution1d_packed(const Mat& bottom_blob, Mat& top_blob, const Ma
{
const
int
p
=
remain_outh_start
+
pp
*
2
;
// shadowed variable for less openmp task args
const
int
elempack
=
bottom_blob
.
elempack
;
const
int
inh
=
bottom_blob
.
h
*
elempack
;
const
int
outw
=
top_blob
.
w
;
float
*
outptr0
=
top_blob
.
row
(
p
);
float
*
outptr1
=
top_blob
.
row
(
p
+
1
);
...
...
src/layer/arm/convolution1d_packed_bf16s.h
浏览文件 @
7883f4d0
...
...
@@ -525,6 +525,12 @@ static void convolution1d_packed_bf16s(const Mat& bottom_blob, Mat& top_blob, co
{
const
int
p
=
remain_outh_start
+
pp
*
8
;
// shadowed variable for less openmp task args
const
int
elempack
=
bottom_blob
.
elempack
;
const
int
inh
=
bottom_blob
.
h
*
elempack
;
const
int
outw
=
top_blob
.
w
;
const
int
out_elempack
=
top_blob
.
elempack
;
unsigned
short
*
outptr
=
top_blob
.
row
<
unsigned
short
>
(
p
/
out_elempack
);
for
(
int
j
=
0
;
j
<
outw
;
j
++
)
...
...
@@ -762,6 +768,12 @@ static void convolution1d_packed_bf16s(const Mat& bottom_blob, Mat& top_blob, co
{
const
int
p
=
remain_outh_start
+
pp
*
4
;
// shadowed variable for less openmp task args
const
int
elempack
=
bottom_blob
.
elempack
;
const
int
inh
=
bottom_blob
.
h
*
elempack
;
const
int
outw
=
top_blob
.
w
;
const
int
out_elempack
=
top_blob
.
elempack
;
unsigned
short
*
outptr
=
top_blob
.
row
<
unsigned
short
>
(
p
/
out_elempack
);
for
(
int
j
=
0
;
j
<
outw
;
j
++
)
...
...
@@ -968,6 +980,11 @@ static void convolution1d_packed_bf16s(const Mat& bottom_blob, Mat& top_blob, co
{
const
int
p
=
remain_outh_start
+
pp
*
2
;
// shadowed variable for less openmp task args
const
int
elempack
=
bottom_blob
.
elempack
;
const
int
inh
=
bottom_blob
.
h
*
elempack
;
const
int
outw
=
top_blob
.
w
;
unsigned
short
*
outptr0
=
top_blob
.
row
<
unsigned
short
>
(
p
);
unsigned
short
*
outptr1
=
top_blob
.
row
<
unsigned
short
>
(
p
+
1
);
...
...
src/layer/arm/convolution1d_packed_fp16s.h
浏览文件 @
7883f4d0
...
...
@@ -474,6 +474,12 @@ static void convolution1d_packed_fp16s(const Mat& bottom_blob, Mat& top_blob, co
{
const
int
p
=
remain_outh_start
+
pp
*
8
;
// shadowed variable for less openmp task args
const
int
elempack
=
bottom_blob
.
elempack
;
const
int
inh
=
bottom_blob
.
h
*
elempack
;
const
int
outw
=
top_blob
.
w
;
const
int
out_elempack
=
top_blob
.
elempack
;
__fp16
*
outptr
=
top_blob
.
row
<
__fp16
>
(
p
/
out_elempack
);
for
(
int
j
=
0
;
j
<
outw
;
j
++
)
...
...
@@ -707,6 +713,12 @@ static void convolution1d_packed_fp16s(const Mat& bottom_blob, Mat& top_blob, co
{
const
int
p
=
remain_outh_start
+
pp
*
4
;
// shadowed variable for less openmp task args
const
int
elempack
=
bottom_blob
.
elempack
;
const
int
inh
=
bottom_blob
.
h
*
elempack
;
const
int
outw
=
top_blob
.
w
;
const
int
out_elempack
=
top_blob
.
elempack
;
__fp16
*
outptr
=
top_blob
.
row
<
__fp16
>
(
p
/
out_elempack
);
for
(
int
j
=
0
;
j
<
outw
;
j
++
)
...
...
@@ -887,6 +899,11 @@ static void convolution1d_packed_fp16s(const Mat& bottom_blob, Mat& top_blob, co
{
const
int
p
=
remain_outh_start
+
pp
*
2
;
// shadowed variable for less openmp task args
const
int
elempack
=
bottom_blob
.
elempack
;
const
int
inh
=
bottom_blob
.
h
*
elempack
;
const
int
outw
=
top_blob
.
w
;
__fp16
*
outptr0
=
top_blob
.
row
<
__fp16
>
(
p
);
__fp16
*
outptr1
=
top_blob
.
row
<
__fp16
>
(
p
+
1
);
...
...
@@ -1206,6 +1223,12 @@ static void convolution1d_packed_fp16sa(const Mat& bottom_blob, Mat& top_blob, c
{
const
int
p
=
remain_outh_start
+
pp
*
8
;
// shadowed variable for less openmp task args
const
int
elempack
=
bottom_blob
.
elempack
;
const
int
inh
=
bottom_blob
.
h
*
elempack
;
const
int
outw
=
top_blob
.
w
;
const
int
out_elempack
=
top_blob
.
elempack
;
__fp16
*
outptr
=
top_blob
.
row
<
__fp16
>
(
p
/
out_elempack
);
for
(
int
j
=
0
;
j
<
outw
;
j
++
)
...
...
@@ -1388,6 +1411,12 @@ static void convolution1d_packed_fp16sa(const Mat& bottom_blob, Mat& top_blob, c
{
const
int
p
=
remain_outh_start
+
pp
*
4
;
// shadowed variable for less openmp task args
const
int
elempack
=
bottom_blob
.
elempack
;
const
int
inh
=
bottom_blob
.
h
*
elempack
;
const
int
outw
=
top_blob
.
w
;
const
int
out_elempack
=
top_blob
.
elempack
;
__fp16
*
outptr
=
top_blob
.
row
<
__fp16
>
(
p
/
out_elempack
);
for
(
int
j
=
0
;
j
<
outw
;
j
++
)
...
...
@@ -1565,6 +1594,11 @@ static void convolution1d_packed_fp16sa(const Mat& bottom_blob, Mat& top_blob, c
{
const
int
p
=
remain_outh_start
+
pp
*
2
;
// shadowed variable for less openmp task args
const
int
elempack
=
bottom_blob
.
elempack
;
const
int
inh
=
bottom_blob
.
h
*
elempack
;
const
int
outw
=
top_blob
.
w
;
__fp16
*
outptr0
=
top_blob
.
row
<
__fp16
>
(
p
);
__fp16
*
outptr1
=
top_blob
.
row
<
__fp16
>
(
p
+
1
);
...
...
src/layer/arm/convolution_packed.h
浏览文件 @
7883f4d0
...
...
@@ -550,6 +550,13 @@ static void convolution_packed(const Mat& bottom_blob, Mat& top_blob, const Mat&
{
const
int
p
=
remain_outch_start
+
pp
*
8
;
// shadowed variable for less openmp task args
const
int
elempack
=
bottom_blob
.
elempack
;
const
int
inch
=
bottom_blob
.
c
*
elempack
;
const
int
outw
=
top_blob
.
w
;
const
int
outh
=
top_blob
.
h
;
const
int
out_elempack
=
top_blob
.
elempack
;
float
*
outptr
=
top_blob
.
channel
(
p
/
out_elempack
);
for
(
int
i
=
0
;
i
<
outh
;
i
++
)
...
...
@@ -768,6 +775,13 @@ static void convolution_packed(const Mat& bottom_blob, Mat& top_blob, const Mat&
{
const
int
p
=
remain_outch_start
+
pp
*
4
;
// shadowed variable for less openmp task args
const
int
elempack
=
bottom_blob
.
elempack
;
const
int
inch
=
bottom_blob
.
c
*
elempack
;
const
int
outw
=
top_blob
.
w
;
const
int
outh
=
top_blob
.
h
;
const
int
out_elempack
=
top_blob
.
elempack
;
float
*
outptr
=
top_blob
.
channel
(
p
/
out_elempack
);
for
(
int
i
=
0
;
i
<
outh
;
i
++
)
...
...
@@ -964,6 +978,12 @@ static void convolution_packed(const Mat& bottom_blob, Mat& top_blob, const Mat&
{
const
int
p
=
remain_outch_start
+
pp
*
2
;
// shadowed variable for less openmp task args
const
int
elempack
=
bottom_blob
.
elempack
;
const
int
inch
=
bottom_blob
.
c
*
elempack
;
const
int
outw
=
top_blob
.
w
;
const
int
outh
=
top_blob
.
h
;
float
*
outptr0
=
top_blob
.
channel
(
p
);
float
*
outptr1
=
top_blob
.
channel
(
p
+
1
);
...
...
src/layer/arm/convolution_packed_bf16s.h
浏览文件 @
7883f4d0
...
...
@@ -550,6 +550,13 @@ static void convolution_packed_bf16s(const Mat& bottom_blob, Mat& top_blob, cons
{
const
int
p
=
remain_outch_start
+
pp
*
8
;
// shadowed variable for less openmp task args
const
int
elempack
=
bottom_blob
.
elempack
;
const
int
inch
=
bottom_blob
.
c
*
elempack
;
const
int
outw
=
top_blob
.
w
;
const
int
outh
=
top_blob
.
h
;
const
int
out_elempack
=
top_blob
.
elempack
;
unsigned
short
*
outptr
=
top_blob
.
channel
(
p
/
out_elempack
);
for
(
int
i
=
0
;
i
<
outh
;
i
++
)
...
...
@@ -787,6 +794,13 @@ static void convolution_packed_bf16s(const Mat& bottom_blob, Mat& top_blob, cons
{
const
int
p
=
remain_outch_start
+
pp
*
4
;
// shadowed variable for less openmp task args
const
int
elempack
=
bottom_blob
.
elempack
;
const
int
inch
=
bottom_blob
.
c
*
elempack
;
const
int
outw
=
top_blob
.
w
;
const
int
outh
=
top_blob
.
h
;
const
int
out_elempack
=
top_blob
.
elempack
;
unsigned
short
*
outptr
=
top_blob
.
channel
(
p
/
out_elempack
);
for
(
int
i
=
0
;
i
<
outh
;
i
++
)
...
...
@@ -993,6 +1007,12 @@ static void convolution_packed_bf16s(const Mat& bottom_blob, Mat& top_blob, cons
{
const
int
p
=
remain_outch_start
+
pp
*
2
;
// shadowed variable for less openmp task args
const
int
elempack
=
bottom_blob
.
elempack
;
const
int
inch
=
bottom_blob
.
c
*
elempack
;
const
int
outw
=
top_blob
.
w
;
const
int
outh
=
top_blob
.
h
;
unsigned
short
*
outptr0
=
top_blob
.
channel
(
p
);
unsigned
short
*
outptr1
=
top_blob
.
channel
(
p
+
1
);
...
...
src/layer/arm/convolution_packed_fp16s.h
浏览文件 @
7883f4d0
...
...
@@ -499,6 +499,13 @@ static void convolution_packed_fp16s(const Mat& bottom_blob, Mat& top_blob, cons
{
const
int
p
=
remain_outch_start
+
pp
*
8
;
// shadowed variable for less openmp task args
const
int
elempack
=
bottom_blob
.
elempack
;
const
int
inch
=
bottom_blob
.
c
*
elempack
;
const
int
outw
=
top_blob
.
w
;
const
int
outh
=
top_blob
.
h
;
const
int
out_elempack
=
top_blob
.
elempack
;
__fp16
*
outptr
=
top_blob
.
channel
(
p
/
out_elempack
);
for
(
int
i
=
0
;
i
<
outh
;
i
++
)
...
...
@@ -732,6 +739,13 @@ static void convolution_packed_fp16s(const Mat& bottom_blob, Mat& top_blob, cons
{
const
int
p
=
remain_outch_start
+
pp
*
4
;
// shadowed variable for less openmp task args
const
int
elempack
=
bottom_blob
.
elempack
;
const
int
inch
=
bottom_blob
.
c
*
elempack
;
const
int
outw
=
top_blob
.
w
;
const
int
outh
=
top_blob
.
h
;
const
int
out_elempack
=
top_blob
.
elempack
;
__fp16
*
outptr
=
top_blob
.
channel
(
p
/
out_elempack
);
for
(
int
i
=
0
;
i
<
outh
;
i
++
)
...
...
@@ -912,6 +926,12 @@ static void convolution_packed_fp16s(const Mat& bottom_blob, Mat& top_blob, cons
{
const
int
p
=
remain_outch_start
+
pp
*
2
;
// shadowed variable for less openmp task args
const
int
elempack
=
bottom_blob
.
elempack
;
const
int
inch
=
bottom_blob
.
c
*
elempack
;
const
int
outw
=
top_blob
.
w
;
const
int
outh
=
top_blob
.
h
;
__fp16
*
outptr0
=
top_blob
.
channel
(
p
);
__fp16
*
outptr1
=
top_blob
.
channel
(
p
+
1
);
...
...
@@ -1254,6 +1274,13 @@ static void convolution_packed_fp16sa(const Mat& bottom_blob, Mat& top_blob, con
{
const
int
p
=
remain_outch_start
+
pp
*
8
;
// shadowed variable for less openmp task args
const
int
elempack
=
bottom_blob
.
elempack
;
const
int
inch
=
bottom_blob
.
c
*
elempack
;
const
int
outw
=
top_blob
.
w
;
const
int
outh
=
top_blob
.
h
;
const
int
out_elempack
=
top_blob
.
elempack
;
__fp16
*
outptr
=
top_blob
.
channel
(
p
/
out_elempack
);
for
(
int
i
=
0
;
i
<
outh
;
i
++
)
...
...
@@ -1435,6 +1462,13 @@ static void convolution_packed_fp16sa(const Mat& bottom_blob, Mat& top_blob, con
{
const
int
p
=
remain_outch_start
+
pp
*
4
;
// shadowed variable for less openmp task args
const
int
elempack
=
bottom_blob
.
elempack
;
const
int
inch
=
bottom_blob
.
c
*
elempack
;
const
int
outw
=
top_blob
.
w
;
const
int
outh
=
top_blob
.
h
;
const
int
out_elempack
=
top_blob
.
elempack
;
__fp16
*
outptr
=
top_blob
.
channel
(
p
/
out_elempack
);
for
(
int
i
=
0
;
i
<
outh
;
i
++
)
...
...
@@ -1611,6 +1645,12 @@ static void convolution_packed_fp16sa(const Mat& bottom_blob, Mat& top_blob, con
{
const
int
p
=
remain_outch_start
+
pp
*
2
;
// shadowed variable for less openmp task args
const
int
elempack
=
bottom_blob
.
elempack
;
const
int
inch
=
bottom_blob
.
c
*
elempack
;
const
int
outw
=
top_blob
.
w
;
const
int
outh
=
top_blob
.
h
;
__fp16
*
outptr0
=
top_blob
.
channel
(
p
);
__fp16
*
outptr1
=
top_blob
.
channel
(
p
+
1
);
...
...
src/layer/arm/gemm_arm.cpp
浏览文件 @
7883f4d0
...
...
@@ -3839,6 +3839,10 @@ static int gemm_arm(const Mat& A, const Mat& B, const Mat& C, Mat& top_blob, int
{
const
int
i
=
ppi
*
TILE_M
;
// shadowed variable for less openmp task args
const
int
M
=
transA
?
A
.
w
:
(
A
.
dims
==
3
?
A
.
c
:
A
.
h
)
*
A
.
elempack
;
const
int
K
=
transA
?
(
A
.
dims
==
3
?
A
.
c
:
A
.
h
)
*
A
.
elempack
:
A
.
w
;
const
int
max_ii
=
std
::
min
((
M
-
i
),
TILE_M
);
Mat
topT_tile
;
...
...
@@ -4013,6 +4017,10 @@ static int gemm_BT_arm(const Mat& A, const Mat& BT, const Mat& C, Mat& top_blob,
{
const
int
i
=
ppi
*
TILE_M
;
// shadowed variable for less openmp task args
const
int
M
=
transA
?
A
.
w
:
(
A
.
dims
==
3
?
A
.
c
:
A
.
h
)
*
A
.
elempack
;
const
int
K
=
transA
?
(
A
.
dims
==
3
?
A
.
c
:
A
.
h
)
*
A
.
elempack
:
A
.
w
;
const
int
max_ii
=
std
::
min
((
M
-
i
),
TILE_M
);
Mat
topT_tile
;
...
...
@@ -4548,6 +4556,10 @@ static int gemm_arm_bf16s(const Mat& A, const Mat& B, const Mat& C, Mat& top_blo
{
const
int
i
=
ppi
*
TILE_M
;
// shadowed variable for less openmp task args
const
int
M
=
transA
?
A
.
w
:
(
A
.
dims
==
3
?
A
.
c
:
A
.
h
)
*
A
.
elempack
;
const
int
K
=
transA
?
(
A
.
dims
==
3
?
A
.
c
:
A
.
h
)
*
A
.
elempack
:
A
.
w
;
const
int
max_ii
=
std
::
min
((
M
-
i
),
TILE_M
);
Mat
topT_tile
;
...
...
@@ -4724,6 +4736,10 @@ static int gemm_BT_arm_bf16s(const Mat& A, const Mat& BT, const Mat& C, Mat& top
{
const
int
i
=
ppi
*
TILE_M
;
// shadowed variable for less openmp task args
const
int
M
=
transA
?
A
.
w
:
(
A
.
dims
==
3
?
A
.
c
:
A
.
h
)
*
A
.
elempack
;
const
int
K
=
transA
?
(
A
.
dims
==
3
?
A
.
c
:
A
.
h
)
*
A
.
elempack
:
A
.
w
;
const
int
max_ii
=
std
::
min
((
M
-
i
),
TILE_M
);
Mat
topT_tile
;
...
...
src/layer/arm/gemm_arm_asimdhp.cpp
浏览文件 @
7883f4d0
...
...
@@ -2398,6 +2398,10 @@ static int gemm_arm_fp16sa(const Mat& A, const Mat& B, const Mat& C, Mat& top_bl
{
const
int
i
=
ppi
*
TILE_M
;
// shadowed variable for less openmp task args
const
int
M
=
transA
?
A
.
w
:
(
A
.
dims
==
3
?
A
.
c
:
A
.
h
)
*
A
.
elempack
;
const
int
K
=
transA
?
(
A
.
dims
==
3
?
A
.
c
:
A
.
h
)
*
A
.
elempack
:
A
.
w
;
const
int
max_ii
=
std
::
min
((
M
-
i
),
TILE_M
);
Mat
topT_tile
;
...
...
@@ -2572,6 +2576,10 @@ static int gemm_BT_arm_fp16sa(const Mat& A, const Mat& BT, const Mat& C, Mat& to
{
const
int
i
=
ppi
*
TILE_M
;
// shadowed variable for less openmp task args
const
int
M
=
transA
?
A
.
w
:
(
A
.
dims
==
3
?
A
.
c
:
A
.
h
)
*
A
.
elempack
;
const
int
K
=
transA
?
(
A
.
dims
==
3
?
A
.
c
:
A
.
h
)
*
A
.
elempack
:
A
.
w
;
const
int
max_ii
=
std
::
min
((
M
-
i
),
TILE_M
);
Mat
topT_tile
;
...
...
src/layer/arm/gemm_arm_vfpv4.cpp
浏览文件 @
7883f4d0
...
...
@@ -85,6 +85,10 @@ static int gemm_arm_fp16s(const Mat& A, const Mat& B, const Mat& C, Mat& top_blo
{
const
int
i
=
ppi
*
TILE_M
;
// shadowed variable for less openmp task args
const
int
M
=
transA
?
A
.
w
:
(
A
.
dims
==
3
?
A
.
c
:
A
.
h
)
*
A
.
elempack
;
const
int
K
=
transA
?
(
A
.
dims
==
3
?
A
.
c
:
A
.
h
)
*
A
.
elempack
:
A
.
w
;
const
int
max_ii
=
std
::
min
((
M
-
i
),
TILE_M
);
Mat
topT_tile
;
...
...
@@ -261,6 +265,10 @@ static int gemm_BT_arm_fp16s(const Mat& A, const Mat& BT, const Mat& C, Mat& top
{
const
int
i
=
ppi
*
TILE_M
;
// shadowed variable for less openmp task args
const
int
M
=
transA
?
A
.
w
:
(
A
.
dims
==
3
?
A
.
c
:
A
.
h
)
*
A
.
elempack
;
const
int
K
=
transA
?
(
A
.
dims
==
3
?
A
.
c
:
A
.
h
)
*
A
.
elempack
:
A
.
w
;
const
int
max_ii
=
std
::
min
((
M
-
i
),
TILE_M
);
Mat
topT_tile
;
...
...
src/layer/arm/gemm_fp16s.h
浏览文件 @
7883f4d0
...
...
@@ -268,6 +268,7 @@ static void pack_B_tile_fp32_to_fp16(const Mat& B, Mat& BT, int j, int max_jj, i
unsigned
short
*
pp
=
BT
;
int
jj
=
0
;
#if __aarch64__
for
(;
jj
+
11
<
max_jj
;
jj
+=
12
)
{
const
float
*
p0
=
(
const
float
*
)
B
+
(
j
+
jj
)
*
B_hstep
+
k
;
...
...
@@ -358,6 +359,7 @@ static void pack_B_tile_fp32_to_fp16(const Mat& B, Mat& BT, int j, int max_jj, i
pb
++
;
}
}
#endif // __aarch64__
for
(;
jj
+
7
<
max_jj
;
jj
+=
8
)
{
const
float
*
p0
=
(
const
float
*
)
B
+
(
j
+
jj
)
*
B_hstep
+
k
;
...
...
@@ -571,6 +573,7 @@ static void transpose_pack_B_tile_fp32_to_fp16(const Mat& B, Mat& BT, int j, int
unsigned
short
*
pp
=
BT
;
int
jj
=
0
;
#if __aarch64__
for
(;
jj
+
11
<
max_jj
;
jj
+=
12
)
{
const
float
*
p0
=
(
const
float
*
)
B
+
k
*
B_hstep
+
(
j
+
jj
);
...
...
@@ -585,6 +588,7 @@ static void transpose_pack_B_tile_fp32_to_fp16(const Mat& B, Mat& BT, int j, int
p0
+=
B_hstep
;
}
}
#endif // __aarch64__
for
(;
jj
+
7
<
max_jj
;
jj
+=
8
)
{
const
float
*
p0
=
(
const
float
*
)
B
+
k
*
B_hstep
+
(
j
+
jj
);
...
...
@@ -1986,6 +1990,7 @@ static void gemm_transB_packed_tile_fp16s(const Mat& AT_tile, const Mat& BT_tile
}
int
jj
=
0
;
#if __aarch64__
for
(;
jj
+
11
<
max_jj
;
jj
+=
12
)
{
float32x4_t
_sum0
;
...
...
@@ -2301,6 +2306,7 @@ static void gemm_transB_packed_tile_fp16s(const Mat& AT_tile, const Mat& BT_tile
outptr
+=
48
;
}
#endif // __aarch64__
for
(;
jj
+
7
<
max_jj
;
jj
+=
8
)
{
float32x4_t
_sum0
;
...
...
@@ -2871,6 +2877,7 @@ static void gemm_transB_packed_tile_fp16s(const Mat& AT_tile, const Mat& BT_tile
}
int
jj
=
0
;
#if __aarch64__
for
(;
jj
+
11
<
max_jj
;
jj
+=
12
)
{
float32x4_t
_sum00
;
...
...
@@ -3042,6 +3049,7 @@ static void gemm_transB_packed_tile_fp16s(const Mat& AT_tile, const Mat& BT_tile
outptr
+=
24
;
}
#endif // __aarch64__
for
(;
jj
+
7
<
max_jj
;
jj
+=
8
)
{
float32x4_t
_sum00
;
...
...
@@ -3517,6 +3525,7 @@ static void gemm_transB_packed_tile_fp16s(const Mat& AT_tile, const Mat& BT_tile
}
int
jj
=
0
;
#if __aarch64__
for
(;
jj
+
11
<
max_jj
;
jj
+=
12
)
{
float32x4_t
_sum0
;
...
...
@@ -3620,6 +3629,7 @@ static void gemm_transB_packed_tile_fp16s(const Mat& AT_tile, const Mat& BT_tile
outptr
+=
12
;
}
#endif // __aarch64__
for
(;
jj
+
7
<
max_jj
;
jj
+=
8
)
{
float32x4_t
_sum0
;
...
...
src/layer/gemm.cpp
浏览文件 @
7883f4d0
...
...
@@ -174,9 +174,9 @@ int Gemm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_bl
B
=
B0
;
}
int
M
=
A
.
dims
==
3
?
A
.
c
:
A
.
h
;
int
K
=
A
.
w
;
// assert A.w == B.w
int
N
=
B
.
dims
==
3
?
B
.
c
:
B
.
h
;
const
int
M
=
A
.
dims
==
3
?
A
.
c
:
A
.
h
;
const
int
K
=
A
.
w
;
// assert A.w == B.w
const
int
N
=
B
.
dims
==
3
?
B
.
c
:
B
.
h
;
const
float
*
ptrC
=
0
;
int
broadcast_type_C
=
0
;
...
...
src/layer/x86/convolution1d_packed.h
浏览文件 @
7883f4d0
...
...
@@ -999,6 +999,12 @@ static void convolution1d_packed(const Mat& bottom_blob, Mat& top_blob, const Ma
{
const
int
p
=
pp
*
16
;
// shadowed variable for less openmp task args
const
int
elempack
=
bottom_blob
.
elempack
;
const
int
inh
=
bottom_blob
.
h
*
elempack
;
const
int
outw
=
top_blob
.
w
;
const
int
out_elempack
=
top_blob
.
elempack
;
float
*
outptr
=
top_blob
.
row
(
p
/
out_elempack
);
for
(
int
j
=
0
;
j
<
outw
;
j
++
)
...
...
@@ -1423,6 +1429,12 @@ static void convolution1d_packed(const Mat& bottom_blob, Mat& top_blob, const Ma
{
const
int
p
=
remain_outh_start
+
pp
*
8
;
// shadowed variable for less openmp task args
const
int
elempack
=
bottom_blob
.
elempack
;
const
int
inh
=
bottom_blob
.
h
*
elempack
;
const
int
outw
=
top_blob
.
w
;
const
int
out_elempack
=
top_blob
.
elempack
;
float
*
outptr
=
top_blob
.
row
(
p
/
out_elempack
);
for
(
int
j
=
0
;
j
<
outw
;
j
++
)
...
...
@@ -1837,6 +1849,12 @@ static void convolution1d_packed(const Mat& bottom_blob, Mat& top_blob, const Ma
{
const
int
p
=
remain_outh_start
+
pp
*
4
;
// shadowed variable for less openmp task args
const
int
elempack
=
bottom_blob
.
elempack
;
const
int
inh
=
bottom_blob
.
h
*
elempack
;
const
int
outw
=
top_blob
.
w
;
const
int
out_elempack
=
top_blob
.
elempack
;
float
*
outptr
=
top_blob
.
row
(
p
/
out_elempack
);
for
(
int
j
=
0
;
j
<
outw
;
j
++
)
...
...
@@ -2245,6 +2263,11 @@ static void convolution1d_packed(const Mat& bottom_blob, Mat& top_blob, const Ma
{
const
int
p
=
remain_outh_start
+
pp
*
2
;
// shadowed variable for less openmp task args
const
int
elempack
=
bottom_blob
.
elempack
;
const
int
inh
=
bottom_blob
.
h
*
elempack
;
const
int
outw
=
top_blob
.
w
;
float
*
outptr0
=
top_blob
.
row
(
p
);
float
*
outptr1
=
top_blob
.
row
(
p
+
1
);
...
...
src/layer/x86/convolution_packed.h
浏览文件 @
7883f4d0
...
...
@@ -1024,6 +1024,13 @@ static void convolution_packed(const Mat& bottom_blob, Mat& top_blob, const Mat&
{
const
int
p
=
pp
*
16
;
// shadowed variable for less openmp task args
const
int
elempack
=
bottom_blob
.
elempack
;
const
int
inch
=
bottom_blob
.
c
*
elempack
;
const
int
outw
=
top_blob
.
w
;
const
int
outh
=
top_blob
.
h
;
const
int
out_elempack
=
top_blob
.
elempack
;
float
*
outptr
=
top_blob
.
channel
(
p
/
out_elempack
);
for
(
int
i
=
0
;
i
<
outh
;
i
++
)
...
...
@@ -1460,6 +1467,13 @@ static void convolution_packed(const Mat& bottom_blob, Mat& top_blob, const Mat&
{
const
int
p
=
remain_outch_start
+
pp
*
8
;
// shadowed variable for less openmp task args
const
int
elempack
=
bottom_blob
.
elempack
;
const
int
inch
=
bottom_blob
.
c
*
elempack
;
const
int
outw
=
top_blob
.
w
;
const
int
outh
=
top_blob
.
h
;
const
int
out_elempack
=
top_blob
.
elempack
;
float
*
outptr
=
top_blob
.
channel
(
p
/
out_elempack
);
for
(
int
i
=
0
;
i
<
outh
;
i
++
)
...
...
@@ -1886,6 +1900,13 @@ static void convolution_packed(const Mat& bottom_blob, Mat& top_blob, const Mat&
{
const
int
p
=
remain_outch_start
+
pp
*
4
;
// shadowed variable for less openmp task args
const
int
elempack
=
bottom_blob
.
elempack
;
const
int
inch
=
bottom_blob
.
c
*
elempack
;
const
int
outw
=
top_blob
.
w
;
const
int
outh
=
top_blob
.
h
;
const
int
out_elempack
=
top_blob
.
elempack
;
float
*
outptr
=
top_blob
.
channel
(
p
/
out_elempack
);
for
(
int
i
=
0
;
i
<
outh
;
i
++
)
...
...
@@ -2306,6 +2327,12 @@ static void convolution_packed(const Mat& bottom_blob, Mat& top_blob, const Mat&
{
const
int
p
=
remain_outch_start
+
pp
*
2
;
// shadowed variable for less openmp task args
const
int
elempack
=
bottom_blob
.
elempack
;
const
int
inch
=
bottom_blob
.
c
*
elempack
;
const
int
outw
=
top_blob
.
w
;
const
int
outh
=
top_blob
.
h
;
float
*
outptr0
=
top_blob
.
channel
(
p
);
float
*
outptr1
=
top_blob
.
channel
(
p
+
1
);
...
...
src/layer/x86/gemm_x86.cpp
浏览文件 @
7883f4d0
...
...
@@ -6897,6 +6897,10 @@ static int gemm_x86(const Mat& A, const Mat& B, const Mat& C, Mat& top_blob, int
{
const
int
i
=
ppi
*
TILE_M
;
// shadowed variable for less openmp task args
const
int
M
=
transA
?
A
.
w
:
(
A
.
dims
==
3
?
A
.
c
:
A
.
h
)
*
A
.
elempack
;
const
int
K
=
transA
?
(
A
.
dims
==
3
?
A
.
c
:
A
.
h
)
*
A
.
elempack
:
A
.
w
;
const
int
max_ii
=
std
::
min
((
M
-
i
),
TILE_M
);
Mat
topT_tile
;
...
...
@@ -7071,6 +7075,10 @@ static int gemm_BT_x86(const Mat& A, const Mat& BT, const Mat& C, Mat& top_blob,
{
const
int
i
=
ppi
*
TILE_M
;
// shadowed variable for less openmp task args
const
int
M
=
transA
?
A
.
w
:
(
A
.
dims
==
3
?
A
.
c
:
A
.
h
)
*
A
.
elempack
;
const
int
K
=
transA
?
(
A
.
dims
==
3
?
A
.
c
:
A
.
h
)
*
A
.
elempack
:
A
.
w
;
const
int
max_ii
=
std
::
min
((
M
-
i
),
TILE_M
);
Mat
topT_tile
;
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录