Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
毕竟曾有刹那
Mace
提交
ea2da73a
Mace
项目概览
毕竟曾有刹那
/
Mace
与 Fork 源项目一致
Fork自
Xiaomi / Mace
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
Mace
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
ea2da73a
编写于
11月 26, 2018
作者:
L
liutuo
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
fix deconv neon bug
上级
5059f1c0
变更
4
隐藏空白更改
内联
并排
Showing
4 changed file
with
45 addition
and
39 deletion
+45
-39
mace/ops/arm/deconv_2d_neon_3x3.cc
mace/ops/arm/deconv_2d_neon_3x3.cc
+2
-1
mace/ops/arm/deconv_2d_neon_4x4.cc
mace/ops/arm/deconv_2d_neon_4x4.cc
+14
-13
mace/ops/arm/depthwise_deconv2d_neon_3x3.cc
mace/ops/arm/depthwise_deconv2d_neon_3x3.cc
+4
-2
mace/ops/arm/depthwise_deconv2d_neon_4x4.cc
mace/ops/arm/depthwise_deconv2d_neon_4x4.cc
+25
-23
未找到文件。
mace/ops/arm/deconv_2d_neon_3x3.cc
浏览文件 @
ea2da73a
...
...
@@ -319,7 +319,7 @@ void Deconv2dNeonK3x3S2(const float *input,
index_t
j
=
0
;
#if defined(MACE_ENABLE_NEON)
for
(
;
j
+
3
<
w
;
j
+=
4
)
{
for
(
index_t
n
=
0
;
n
+
9
<
outw
;
n
+=
8
)
{
float32x4_t
in_vec
=
vld1q_f32
(
in
);
// out row 0
...
...
@@ -365,6 +365,7 @@ void Deconv2dNeonK3x3S2(const float *input,
out_row_0
+=
8
;
out_row_1
+=
8
;
out_row_2
+=
8
;
j
+=
4
;
}
#endif
for
(;
j
<
w
;
++
j
)
{
...
...
mace/ops/arm/deconv_2d_neon_4x4.cc
浏览文件 @
ea2da73a
...
...
@@ -32,12 +32,12 @@ void Deconv2dNeonK4x4S1(const float *input,
const
index_t
outch
=
out_shape
[
1
];
const
index_t
out_img_size
=
outh
*
outw
;
#pragma omp parallel for collapse(2)
for
(
int
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
int
oc
=
0
;
oc
<
outch
;
oc
+=
2
)
{
for
(
in
dex_
t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
in
dex_
t
oc
=
0
;
oc
<
outch
;
oc
+=
2
)
{
if
(
oc
+
1
<
outch
)
{
float
*
out_base
=
output
+
(
b
*
outch
+
oc
)
*
out_img_size
;
float
*
out_base1
=
out_base
+
out_img_size
;
for
(
int
q
=
0
;
q
<
inch
;
q
++
)
{
for
(
in
dex_
t
q
=
0
;
q
<
inch
;
q
++
)
{
const
float
*
input_base
=
input
+
(
b
*
inch
+
q
)
*
h
*
w
;
const
float
*
in
=
input_base
;
const
float
*
kernel_base
=
filter
+
(
oc
*
inch
+
q
)
*
16
;
...
...
@@ -62,7 +62,7 @@ void Deconv2dNeonK4x4S1(const float *input,
float32x4_t
k12_vec
=
vld1q_f32
(
k12
);
float32x4_t
k13_vec
=
vld1q_f32
(
k13
);
#endif
for
(
int
i
=
0
;
i
<
h
;
i
++
)
{
for
(
in
dex_
t
i
=
0
;
i
<
h
;
i
++
)
{
float
*
out_row
=
out_base
+
i
*
outw
;
float
*
out_row_0
=
out_row
;
...
...
@@ -77,7 +77,7 @@ void Deconv2dNeonK4x4S1(const float *input,
float
*
out_row1_2
=
out_row1_1
+
outw
;
float
*
out_row1_3
=
out_row1_2
+
outw
;
int
j
=
0
;
in
dex_
t
j
=
0
;
#if defined(MACE_ENABLE_NEON)
for
(;
j
+
3
<
w
;
j
+=
4
)
{
float32x4_t
in_vec
=
vld1q_f32
(
in
);
...
...
@@ -252,7 +252,7 @@ void Deconv2dNeonK4x4S1(const float *input,
}
}
else
{
float
*
out_base
=
output
+
(
b
*
outch
+
oc
)
*
out_img_size
;
for
(
int
q
=
0
;
q
<
inch
;
q
++
)
{
for
(
in
dex_
t
q
=
0
;
q
<
inch
;
q
++
)
{
const
float
*
input_base
=
input
+
(
b
*
inch
+
q
)
*
h
*
w
;
const
float
*
kernel_base
=
filter
+
(
oc
*
inch
+
q
)
*
16
;
const
float
*
in
=
input_base
;
...
...
@@ -266,7 +266,7 @@ void Deconv2dNeonK4x4S1(const float *input,
float32x4_t
k2_vec
=
vld1q_f32
(
k2
);
float32x4_t
k3_vec
=
vld1q_f32
(
k3
);
#endif
for
(
int
i
=
0
;
i
<
h
;
i
++
)
{
for
(
in
dex_
t
i
=
0
;
i
<
h
;
i
++
)
{
float
*
out_row
=
out_base
+
i
*
outw
;
float
*
out_row_0
=
out_row
;
float
*
out_row_1
=
out_row_0
+
outw
;
...
...
@@ -387,10 +387,10 @@ void Deconv2dNeonK4x4S2(const float *input,
const
index_t
out_img_size
=
outh
*
outw
;
#pragma omp parallel for collapse(2)
for
(
int
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
int
p
=
0
;
p
<
outch
;
p
++
)
{
for
(
in
dex_
t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
in
dex_
t
p
=
0
;
p
<
outch
;
p
++
)
{
float
*
out_base
=
output
+
(
b
*
outch
+
p
)
*
out_img_size
;
for
(
int
q
=
0
;
q
<
inch
;
q
++
)
{
for
(
in
dex_
t
q
=
0
;
q
<
inch
;
q
++
)
{
const
float
*
input_base
=
input
+
(
b
*
inch
+
q
)
*
h
*
w
;
const
float
*
kernel_base
=
filter
+
(
p
*
inch
+
q
)
*
16
;
const
float
*
in
=
input_base
;
...
...
@@ -405,7 +405,7 @@ void Deconv2dNeonK4x4S2(const float *input,
float32x4_t
k2_vec
=
vld1q_f32
(
k2
);
float32x4_t
k3_vec
=
vld1q_f32
(
k3
);
#endif
for
(
int
i
=
0
;
i
<
h
;
i
++
)
{
for
(
in
dex_
t
i
=
0
;
i
<
h
;
i
++
)
{
float
*
out_row
=
out_base
+
2
*
i
*
outw
;
float
*
out_row_0
=
out_row
;
...
...
@@ -413,9 +413,9 @@ void Deconv2dNeonK4x4S2(const float *input,
float
*
out_row_2
=
out_row_1
+
outw
;
float
*
out_row_3
=
out_row_2
+
outw
;
int
j
=
0
;
in
dex_
t
j
=
0
;
#if defined(MACE_ENABLE_NEON)
for
(
;
j
+
3
<
w
;
j
+=
4
)
{
for
(
index_t
n
=
0
;
n
+
9
<
outw
;
n
+=
8
)
{
float32x4_t
in_vec
=
vld1q_f32
(
in
);
// row 0
...
...
@@ -479,6 +479,7 @@ void Deconv2dNeonK4x4S2(const float *input,
out_row_1
+=
8
;
out_row_2
+=
8
;
out_row_3
+=
8
;
j
+=
4
;
}
#endif
for
(;
j
<
w
;
j
++
)
{
...
...
mace/ops/arm/depthwise_deconv2d_neon_3x3.cc
浏览文件 @
ea2da73a
...
...
@@ -163,7 +163,7 @@ void DepthwiseDeconv2dNeonK3x3S2(const float *input,
index_t
j
=
0
;
#if defined(MACE_ENABLE_NEON)
for
(
;
j
+
3
<
w
;
j
+=
4
)
{
for
(
index_t
n
=
0
;
n
+
9
<
outw
;
n
+=
8
)
{
float32x4_t
in_vec
=
vld1q_f32
(
in
);
// out row 0
...
...
@@ -209,6 +209,7 @@ void DepthwiseDeconv2dNeonK3x3S2(const float *input,
out_row_0
+=
8
;
out_row_1
+=
8
;
out_row_2
+=
8
;
j
+=
4
;
}
#endif
for
(;
j
<
w
;
++
j
)
{
...
...
@@ -554,7 +555,7 @@ void GroupDeconv2dNeonK3x3S2(const float *input,
index_t
j
=
0
;
#if defined(MACE_ENABLE_NEON)
for
(
;
j
+
3
<
w
;
j
+=
4
)
{
for
(
index_t
n
=
0
;
n
+
9
<
outw
;
n
+=
8
)
{
float32x4_t
in_vec
=
vld1q_f32
(
in
);
// out row 0
...
...
@@ -600,6 +601,7 @@ void GroupDeconv2dNeonK3x3S2(const float *input,
out_row_0
+=
8
;
out_row_1
+=
8
;
out_row_2
+=
8
;
j
+=
4
;
}
#endif
for
(;
j
<
w
;
++
j
)
{
...
...
mace/ops/arm/depthwise_deconv2d_neon_4x4.cc
浏览文件 @
ea2da73a
...
...
@@ -34,8 +34,8 @@ void DepthwiseDeconv2dNeonK4x4S1(const float *input,
const
index_t
out_img_size
=
outh
*
outw
;
#pragma omp parallel for collapse(2)
for
(
int
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
int
c
=
0
;
c
<
channels
;
++
c
)
{
for
(
in
dex_
t
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
in
dex_
t
c
=
0
;
c
<
channels
;
++
c
)
{
const
index_t
offset
=
b
*
channels
+
c
;
float
*
out_base
=
output
+
offset
*
out_img_size
;
const
float
*
input_base
=
input
+
offset
*
in_img_size
;
...
...
@@ -51,13 +51,13 @@ void DepthwiseDeconv2dNeonK4x4S1(const float *input,
float32x4_t
k2_vec
=
vld1q_f32
(
k2
);
float32x4_t
k3_vec
=
vld1q_f32
(
k3
);
#endif
for
(
int
i
=
0
;
i
<
h
;
i
++
)
{
for
(
in
dex_
t
i
=
0
;
i
<
h
;
i
++
)
{
float
*
out_row
=
out_base
+
i
*
outw
;
float
*
out_row_0
=
out_row
;
float
*
out_row_1
=
out_row_0
+
outw
;
float
*
out_row_2
=
out_row_1
+
outw
;
float
*
out_row_3
=
out_row_2
+
outw
;
int
j
=
0
;
in
dex_
t
j
=
0
;
#if defined(MACE_ENABLE_NEON)
for
(;
j
+
3
<
w
;
j
+=
4
)
{
float32x4_t
in_vec
=
vld1q_f32
(
in
);
...
...
@@ -170,8 +170,8 @@ void DepthwiseDeconv2dNeonK4x4S2(const float *input,
const
index_t
out_img_size
=
outh
*
outw
;
#pragma omp parallel for collapse(2)
for
(
int
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
int
c
=
0
;
c
<
channels
;
++
c
)
{
for
(
in
dex_
t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
in
dex_
t
c
=
0
;
c
<
channels
;
++
c
)
{
const
index_t
offset
=
b
*
channels
+
c
;
float
*
out_base
=
output
+
offset
*
out_img_size
;
const
float
*
input_base
=
input
+
offset
*
in_img_size
;
...
...
@@ -188,7 +188,7 @@ void DepthwiseDeconv2dNeonK4x4S2(const float *input,
float32x4_t
k2_vec
=
vld1q_f32
(
k2
);
float32x4_t
k3_vec
=
vld1q_f32
(
k3
);
#endif
for
(
int
i
=
0
;
i
<
h
;
i
++
)
{
for
(
in
dex_
t
i
=
0
;
i
<
h
;
i
++
)
{
float
*
out_row
=
out_base
+
2
*
i
*
outw
;
float
*
out_row_0
=
out_row
;
...
...
@@ -196,9 +196,9 @@ void DepthwiseDeconv2dNeonK4x4S2(const float *input,
float
*
out_row_2
=
out_row_1
+
outw
;
float
*
out_row_3
=
out_row_2
+
outw
;
int
j
=
0
;
in
dex_
t
j
=
0
;
#if defined(MACE_ENABLE_NEON)
for
(
;
j
+
3
<
w
;
j
+=
4
)
{
for
(
index_t
n
=
0
;
n
+
9
<
outw
;
n
+=
8
)
{
float32x4_t
in_vec
=
vld1q_f32
(
in
);
// row 0
...
...
@@ -262,6 +262,7 @@ void DepthwiseDeconv2dNeonK4x4S2(const float *input,
out_row_1
+=
8
;
out_row_2
+=
8
;
out_row_3
+=
8
;
j
+=
4
;
}
#endif
for
(;
j
<
w
;
j
++
)
{
...
...
@@ -304,15 +305,15 @@ void GroupDeconv2dNeonK4x4S1(const float *input,
const
index_t
outch_g
=
outch
/
group
;
#pragma omp parallel for collapse(3)
for
(
int
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
in
dex_
t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
int
g
=
0
;
g
<
group
;
++
g
)
{
for
(
int
oc
=
0
;
oc
<
outch_g
;
oc
+=
2
)
{
for
(
in
dex_
t
oc
=
0
;
oc
<
outch_g
;
oc
+=
2
)
{
if
(
oc
+
1
<
outch_g
)
{
const
index_t
out_offset
=
(
b
*
outch
+
outch_g
*
g
+
oc
)
*
out_img_size
;
float
*
out_base
=
output
+
out_offset
;
float
*
out_base1
=
out_base
+
out_img_size
;
for
(
int
ic
=
0
;
ic
<
inch_g
;
ic
++
)
{
for
(
in
dex_
t
ic
=
0
;
ic
<
inch_g
;
ic
++
)
{
const
index_t
in_offset
=
(
b
*
inch
+
inch_g
*
g
+
ic
)
*
in_img_size
;
const
float
*
input_base
=
input
+
in_offset
;
...
...
@@ -341,7 +342,7 @@ void GroupDeconv2dNeonK4x4S1(const float *input,
float32x4_t
k12_vec
=
vld1q_f32
(
k12
);
float32x4_t
k13_vec
=
vld1q_f32
(
k13
);
#endif
for
(
int
i
=
0
;
i
<
h
;
i
++
)
{
for
(
in
dex_
t
i
=
0
;
i
<
h
;
i
++
)
{
float
*
out_row
=
out_base
+
i
*
outw
;
float
*
out_row_0
=
out_row
;
...
...
@@ -356,7 +357,7 @@ void GroupDeconv2dNeonK4x4S1(const float *input,
float
*
out_row1_2
=
out_row1_1
+
outw
;
float
*
out_row1_3
=
out_row1_2
+
outw
;
int
j
=
0
;
in
dex_
t
j
=
0
;
#if defined(MACE_ENABLE_NEON)
for
(;
j
+
3
<
w
;
j
+=
4
)
{
float32x4_t
in_vec
=
vld1q_f32
(
in
);
...
...
@@ -533,7 +534,7 @@ void GroupDeconv2dNeonK4x4S1(const float *input,
const
index_t
out_offset
=
(
b
*
outch
+
outch_g
*
g
+
oc
)
*
out_img_size
;
float
*
out_base
=
output
+
out_offset
;
for
(
int
ic
=
0
;
ic
<
inch_g
;
++
ic
)
{
for
(
in
dex_
t
ic
=
0
;
ic
<
inch_g
;
++
ic
)
{
const
index_t
in_offset
=
(
b
*
inch
+
inch_g
*
g
+
ic
)
*
in_img_size
;
const
index_t
kernel_offset
=
...
...
@@ -552,13 +553,13 @@ void GroupDeconv2dNeonK4x4S1(const float *input,
float32x4_t
k2_vec
=
vld1q_f32
(
k2
);
float32x4_t
k3_vec
=
vld1q_f32
(
k3
);
#endif
for
(
int
i
=
0
;
i
<
h
;
i
++
)
{
for
(
in
dex_
t
i
=
0
;
i
<
h
;
i
++
)
{
float
*
out_row
=
out_base
+
i
*
outw
;
float
*
out_row_0
=
out_row
;
float
*
out_row_1
=
out_row_0
+
outw
;
float
*
out_row_2
=
out_row_1
+
outw
;
float
*
out_row_3
=
out_row_2
+
outw
;
int
j
=
0
;
in
dex_
t
j
=
0
;
#if defined(MACE_ENABLE_NEON)
for
(;
j
+
3
<
w
;
j
+=
4
)
{
float32x4_t
in_vec
=
vld1q_f32
(
in
);
...
...
@@ -679,13 +680,13 @@ void GroupDeconv2dNeonK4x4S2(const float *input,
const
index_t
outch_g
=
outch
/
group
;
#pragma omp parallel for collapse(3)
for
(
int
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
in
dex_
t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
int
g
=
0
;
g
<
group
;
++
g
)
{
for
(
int
oc
=
0
;
oc
<
outch_g
;
oc
++
)
{
for
(
in
dex_
t
oc
=
0
;
oc
<
outch_g
;
oc
++
)
{
const
index_t
out_offset
=
(
b
*
outch
+
outch_g
*
g
+
oc
)
*
out_img_size
;
float
*
out_base
=
output
+
out_offset
;
for
(
int
ic
=
0
;
ic
<
inch_g
;
ic
++
)
{
for
(
in
dex_
t
ic
=
0
;
ic
<
inch_g
;
ic
++
)
{
const
index_t
in_offset
=
(
b
*
inch
+
inch_g
*
g
+
ic
)
*
in_img_size
;
const
index_t
kernel_offset
=
...
...
@@ -704,7 +705,7 @@ void GroupDeconv2dNeonK4x4S2(const float *input,
float32x4_t
k2_vec
=
vld1q_f32
(
k2
);
float32x4_t
k3_vec
=
vld1q_f32
(
k3
);
#endif
for
(
int
i
=
0
;
i
<
h
;
i
++
)
{
for
(
in
dex_
t
i
=
0
;
i
<
h
;
i
++
)
{
float
*
out_row
=
out_base
+
2
*
i
*
outw
;
float
*
out_row_0
=
out_row
;
...
...
@@ -712,9 +713,9 @@ void GroupDeconv2dNeonK4x4S2(const float *input,
float
*
out_row_2
=
out_row_1
+
outw
;
float
*
out_row_3
=
out_row_2
+
outw
;
int
j
=
0
;
in
dex_
t
j
=
0
;
#if defined(MACE_ENABLE_NEON)
for
(
;
j
+
3
<
w
;
j
+=
4
)
{
for
(
index_t
n
=
0
;
n
+
9
<
outw
;
n
+=
8
)
{
float32x4_t
in_vec
=
vld1q_f32
(
in
);
// row 0
...
...
@@ -778,6 +779,7 @@ void GroupDeconv2dNeonK4x4S2(const float *input,
out_row_1
+=
8
;
out_row_2
+=
8
;
out_row_3
+=
8
;
j
+=
4
;
}
#endif
for
(;
j
<
w
;
j
++
)
{
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录