Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
项目经理老王
Mace
提交
e3a8a08e
Mace
项目概览
项目经理老王
/
Mace
与 Fork 源项目一致
Fork自
Xiaomi / Mace
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
Mace
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
e3a8a08e
编写于
10月 08, 2018
作者:
L
liutuo
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
optimize deconv omp
上级
4d9c948a
变更
5
隐藏空白更改
内联
并排
Showing
5 changed files
with
40 additions
and
63 deletions
+40
-63
mace/kernels/arm/deconv_2d_neon.h
mace/kernels/arm/deconv_2d_neon.h
+0
-4
mace/kernels/arm/deconv_2d_neon_3x3.cc
mace/kernels/arm/deconv_2d_neon_3x3.cc
+2
-16
mace/kernels/arm/deconv_2d_neon_4x4.cc
mace/kernels/arm/deconv_2d_neon_4x4.cc
+3
-14
mace/kernels/deconv_2d.h
mace/kernels/deconv_2d.h
+33
-24
mace/ops/deconv_2d_benchmark.cc
mace/ops/deconv_2d_benchmark.cc
+2
-5
未找到文件。
mace/kernels/arm/deconv_2d_neon.h
浏览文件 @
e3a8a08e
...
...
@@ -26,28 +26,24 @@ namespace kernels {
void
Deconv2dNeonK3x3S1
(
const
float
*
input
,
const
float
*
filter
,
const
float
*
bias
,
const
index_t
*
in_shape
,
const
index_t
*
out_shape
,
float
*
output
);
void
Deconv2dNeonK3x3S2
(
const
float
*
input
,
const
float
*
filter
,
const
float
*
bias
,
const
index_t
*
in_shape
,
const
index_t
*
out_shape
,
float
*
output
);
void
Deconv2dNeonK4x4S1
(
const
float
*
input
,
const
float
*
filter
,
const
float
*
bias
,
const
index_t
*
in_shape
,
const
index_t
*
out_shape
,
float
*
output
);
void
Deconv2dNeonK4x4S2
(
const
float
*
input
,
const
float
*
filter
,
const
float
*
bias
,
const
index_t
*
in_shape
,
const
index_t
*
out_shape
,
float
*
output
);
...
...
mace/kernels/arm/deconv_2d_neon_3x3.cc
浏览文件 @
e3a8a08e
...
...
@@ -20,7 +20,6 @@ namespace kernels {
void
Deconv2dNeonK3x3S1
(
const
float
*
input
,
const
float
*
filter
,
const
float
*
bias
,
const
index_t
*
in_shape
,
const
index_t
*
out_shape
,
float
*
output
)
{
...
...
@@ -40,12 +39,6 @@ void Deconv2dNeonK3x3S1(const float *input,
if
(
oc
+
1
<
outch
)
{
float
*
out_base0
=
output
+
(
b
*
outch
+
oc
)
*
out_img_size
;
float
*
out_base1
=
out_base0
+
out_img_size
;
const
float
bias_value0
=
bias
?
bias
[
oc
]
:
0.
f
;
const
float
bias_value1
=
bias
?
bias
[
oc
+
1
]
:
0.
f
;
std
::
fill_n
(
out_base0
,
out_img_size
,
bias_value0
);
std
::
fill_n
(
out_base1
,
out_img_size
,
bias_value1
);
for
(
index_t
ic
=
0
;
ic
<
inch
;
++
ic
)
{
const
float
*
input_base
=
input
+
(
b
*
inch
+
ic
)
*
h
*
w
;
const
float
*
kernel_base0
=
filter
+
(
oc
*
inch
+
ic
)
*
9
;
...
...
@@ -197,8 +190,6 @@ void Deconv2dNeonK3x3S1(const float *input,
}
}
else
{
float
*
out_base0
=
output
+
(
b
*
outch
+
oc
)
*
outh
*
outw
;
const
float
bias_value0
=
bias
?
bias
[
oc
]
:
0.
f
;
std
::
fill_n
(
out_base0
,
outh
*
outw
,
bias_value0
);
for
(
index_t
ic
=
0
;
ic
<
inch
;
++
ic
)
{
const
float
*
input_base
=
input
+
(
b
*
inch
+
ic
)
*
h
*
w
;
const
float
*
kernel_base0
=
filter
+
(
oc
*
inch
+
ic
)
*
9
;
...
...
@@ -290,7 +281,6 @@ void Deconv2dNeonK3x3S1(const float *input,
void
Deconv2dNeonK3x3S2
(
const
float
*
input
,
const
float
*
filter
,
const
float
*
bias
,
const
index_t
*
in_shape
,
const
index_t
*
out_shape
,
float
*
output
)
{
...
...
@@ -303,15 +293,11 @@ void Deconv2dNeonK3x3S2(const float *input,
const
index_t
outw
=
out_shape
[
3
];
const
index_t
out_img_size
=
outh
*
outw
;
#pragma omp parallel for collapse(
2
)
#pragma omp parallel for collapse(
3
)
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
oc
=
0
;
oc
<
outch
;
++
oc
)
{
float
*
out_base
=
output
+
(
b
*
outch
+
oc
)
*
out_img_size
;
const
float
bias_value
=
bias
?
bias
[
oc
]
:
0.
f
;
std
::
fill_n
(
out_base
,
out_img_size
,
bias_value
);
for
(
index_t
ic
=
0
;
ic
<
inch
;
++
ic
)
{
float
*
out_base
=
output
+
(
b
*
outch
+
oc
)
*
out_img_size
;
const
float
*
input_base
=
input
+
(
b
*
inch
+
ic
)
*
h
*
w
;
const
float
*
kernel_base
=
filter
+
(
oc
*
inch
+
ic
)
*
9
;
const
float
*
in
=
input_base
;
...
...
mace/kernels/arm/deconv_2d_neon_4x4.cc
浏览文件 @
e3a8a08e
...
...
@@ -20,7 +20,6 @@ namespace kernels {
void
Deconv2dNeonK4x4S1
(
const
float
*
input
,
const
float
*
filter
,
const
float
*
bias
,
const
index_t
*
in_shape
,
const
index_t
*
out_shape
,
float
*
output
)
{
...
...
@@ -32,16 +31,12 @@ void Deconv2dNeonK4x4S1(const float *input,
const
index_t
outw
=
out_shape
[
3
];
const
index_t
outch
=
out_shape
[
1
];
const
index_t
out_img_size
=
outh
*
outw
;
#pragma omp parallel for
#pragma omp parallel for
collapse(2)
for
(
int
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
int
oc
=
0
;
oc
<
outch
;
oc
+=
2
)
{
if
(
oc
+
1
<
outch
)
{
float
*
out_base
=
output
+
(
b
*
outch
+
oc
)
*
out_img_size
;
float
*
out_base1
=
out_base
+
out_img_size
;
const
float
bias_value
=
bias
?
bias
[
oc
]
:
0.
f
;
std
::
fill_n
(
out_base
,
out_img_size
,
bias_value
);
const
float
bias_value1
=
bias
?
bias
[
oc
+
1
]
:
0.
f
;
std
::
fill_n
(
out_base1
,
out_img_size
,
bias_value1
);
for
(
int
q
=
0
;
q
<
inch
;
q
++
)
{
const
float
*
input_base
=
input
+
(
b
*
inch
+
q
)
*
h
*
w
;
const
float
*
in
=
input_base
;
...
...
@@ -257,8 +252,6 @@ void Deconv2dNeonK4x4S1(const float *input,
}
}
else
{
float
*
out_base
=
output
+
(
b
*
outch
+
oc
)
*
out_img_size
;
const
float
bias_value
=
bias
?
bias
[
oc
]
:
0.
f
;
std
::
fill_n
(
out_base
,
out_img_size
,
bias_value
);
for
(
int
q
=
0
;
q
<
inch
;
q
++
)
{
const
float
*
input_base
=
input
+
(
b
*
inch
+
q
)
*
h
*
w
;
const
float
*
kernel_base
=
filter
+
(
oc
*
inch
+
q
)
*
16
;
...
...
@@ -381,7 +374,6 @@ void Deconv2dNeonK4x4S1(const float *input,
void
Deconv2dNeonK4x4S2
(
const
float
*
input
,
const
float
*
filter
,
const
float
*
bias
,
const
index_t
*
in_shape
,
const
index_t
*
out_shape
,
float
*
output
)
{
...
...
@@ -394,14 +386,11 @@ void Deconv2dNeonK4x4S2(const float *input,
const
index_t
outch
=
out_shape
[
1
];
const
index_t
out_img_size
=
outh
*
outw
;
#pragma omp parallel for
#pragma omp parallel for
collapse(3)
for
(
int
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
int
p
=
0
;
p
<
outch
;
p
++
)
{
float
*
out_base
=
output
+
(
b
*
outch
+
p
)
*
out_img_size
;
const
float
bias_value
=
bias
?
bias
[
p
]
:
0.
f
;
std
::
fill_n
(
out_base
,
outh
*
outw
,
bias_value
);
for
(
int
q
=
0
;
q
<
inch
;
q
++
)
{
float
*
out_base
=
output
+
(
b
*
outch
+
p
)
*
out_img_size
;
const
float
*
input_base
=
input
+
(
b
*
inch
+
q
)
*
h
*
w
;
const
float
*
kernel_base
=
filter
+
(
p
*
inch
+
q
)
*
16
;
const
float
*
in
=
input_base
;
...
...
mace/kernels/deconv_2d.h
浏览文件 @
e3a8a08e
...
...
@@ -184,7 +184,6 @@ struct Deconv2dFunctor<DeviceType::CPU, float>: Deconv2dFunctorBase {
void
Deconv2dGeneral
(
const
float
*
input
,
const
float
*
filter
,
const
float
*
bias
,
const
index_t
kernel_h
,
const
index_t
kernel_w
,
const
int
*
strides
,
...
...
@@ -206,23 +205,25 @@ struct Deconv2dFunctor<DeviceType::CPU, float>: Deconv2dFunctorBase {
}
}
#pragma omp parallel for
for
(
int
b
=
0
;
b
<
in_shape
[
0
];
++
b
)
{
for
(
int
oc
=
0
;
oc
<
out_shape
[
1
];
++
oc
)
{
float
*
out_base
=
output
+
(
b
*
out_shape
[
1
]
+
oc
)
*
out_img_size
;
const
float
bias_value
=
bias
?
bias
[
oc
]
:
0.
f
;
std
::
fill_n
(
out_base
,
out_img_size
,
bias_value
);
const
index_t
batch
=
in_shape
[
0
];
const
index_t
out_channels
=
out_shape
[
1
];
const
index_t
in_channels
=
in_shape
[
1
];
#pragma omp parallel for collapse(4)
for
(
int
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
int
oc
=
0
;
oc
<
out_channels
;
++
oc
)
{
for
(
int
i
=
0
;
i
<
in_height
;
++
i
)
{
for
(
int
j
=
0
;
j
<
in_width
;
++
j
)
{
float
*
out_base
=
output
+
(
b
*
out_channels
+
oc
)
*
out_img_size
;
const
index_t
out_offset
=
i
*
strides
[
0
]
*
out_width
+
j
*
strides
[
1
];
for
(
int
ic
=
0
;
ic
<
in_
shape
[
1
]
;
++
ic
)
{
for
(
int
ic
=
0
;
ic
<
in_
channels
;
++
ic
)
{
const
index_t
input_idx
=
(
b
*
in_
shape
[
1
]
+
ic
)
*
in_img_size
+
i
*
in_width
+
j
;
(
b
*
in_
channels
+
ic
)
*
in_img_size
+
i
*
in_width
+
j
;
const
float
val
=
input
[
input_idx
];
const
index_t
kernel_offset
=
(
oc
*
in_
shape
[
1
]
+
ic
)
*
kernel_size
;
(
oc
*
in_
channels
+
ic
)
*
kernel_size
;
for
(
int
k
=
0
;
k
<
kernel_size
;
++
k
)
{
const
index_t
out_idx
=
out_offset
+
index_map
[
k
];
const
index_t
kernel_idx
=
kernel_offset
+
k
;
...
...
@@ -248,7 +249,7 @@ struct Deconv2dFunctor<DeviceType::CPU, float>: Deconv2dFunctorBase {
const
index_t
out_height
=
out_shape
[
2
];
const
index_t
out_width
=
out_shape
[
3
];
#pragma omp parallel for
#pragma omp parallel for
collapse(3)
for
(
int
i
=
0
;
i
<
batch
;
++
i
)
{
for
(
int
j
=
0
;
j
<
channel
;
++
j
)
{
for
(
int
k
=
0
;
k
<
out_height
;
++
k
)
{
...
...
@@ -324,7 +325,6 @@ struct Deconv2dFunctor<DeviceType::CPU, float>: Deconv2dFunctorBase {
"Input/Output batch size mismatch"
);
std
::
function
<
void
(
const
float
*
input
,
const
float
*
filter
,
const
float
*
bias
,
const
index_t
*
in_shape
,
const
index_t
*
out_shape
,
float
*
output
)
>
deconv_func
;
...
...
@@ -354,6 +354,8 @@ struct Deconv2dFunctor<DeviceType::CPU, float>: Deconv2dFunctorBase {
scratch
->
Rewind
();
scratch
->
GrowSize
(
padded_out_size
);
Tensor
padded_out
(
scratch
->
Scratch
(
padded_out_size
),
DT_FLOAT
);
padded_out
.
Reshape
(
padded_out_shape
);
padded_out
.
Clear
();
auto
*
padded_out_data
=
padded_out
.
mutable_data
<
float
>
();
bool
use_neon_3x3_s1
=
kernel_h
==
kernel_w
&&
kernel_h
==
3
&&
...
...
@@ -369,13 +371,11 @@ struct Deconv2dFunctor<DeviceType::CPU, float>: Deconv2dFunctorBase {
if
(
use_neon_3x3_s1
)
{
deconv_func
=
[
=
](
const
float
*
input
,
const
float
*
filter
,
const
float
*
bias
,
const
index_t
*
in_shape
,
const
index_t
*
padded_out_shape
,
float
*
padded_output
)
{
Deconv2dNeonK3x3S1
(
input
,
filter
,
bias
,
in_shape
,
padded_out_shape
,
padded_output
);
...
...
@@ -383,13 +383,11 @@ struct Deconv2dFunctor<DeviceType::CPU, float>: Deconv2dFunctorBase {
}
else
if
(
use_neon_3x3_s2
)
{
deconv_func
=
[
=
](
const
float
*
input
,
const
float
*
filter
,
const
float
*
bias
,
const
index_t
*
in_shape
,
const
index_t
*
padded_out_shape
,
float
*
padded_output
)
{
Deconv2dNeonK3x3S2
(
input
,
filter
,
bias
,
in_shape
,
padded_out_shape
,
padded_output
);
...
...
@@ -397,13 +395,11 @@ struct Deconv2dFunctor<DeviceType::CPU, float>: Deconv2dFunctorBase {
}
else
if
(
use_neon_4x4_s1
)
{
deconv_func
=
[
=
](
const
float
*
input
,
const
float
*
filter
,
const
float
*
bias
,
const
index_t
*
in_shape
,
const
index_t
*
padded_out_shape
,
float
*
padded_output
)
{
Deconv2dNeonK4x4S1
(
input
,
filter
,
bias
,
in_shape
,
padded_out_shape
,
padded_output
);
...
...
@@ -411,13 +407,11 @@ struct Deconv2dFunctor<DeviceType::CPU, float>: Deconv2dFunctorBase {
}
else
if
(
use_neon_4x4_s2
)
{
deconv_func
=
[
=
](
const
float
*
input
,
const
float
*
filter
,
const
float
*
bias
,
const
index_t
*
in_shape
,
const
index_t
*
padded_out_shape
,
float
*
padded_output
)
{
Deconv2dNeonK4x4S2
(
input
,
filter
,
bias
,
in_shape
,
padded_out_shape
,
padded_output
);
...
...
@@ -425,13 +419,11 @@ struct Deconv2dFunctor<DeviceType::CPU, float>: Deconv2dFunctorBase {
}
else
{
deconv_func
=
[
=
](
const
float
*
input
,
const
float
*
filter
,
const
float
*
bias
,
const
index_t
*
in_shape
,
const
index_t
*
padded_out_shape
,
float
*
padded_output
)
{
Deconv2dGeneral
(
input
,
filter
,
bias
,
kernel_h
,
kernel_w
,
strides_
.
data
(),
...
...
@@ -444,9 +436,24 @@ struct Deconv2dFunctor<DeviceType::CPU, float>: Deconv2dFunctorBase {
bool
no_pad
=
padded_out_h
==
output_shape
[
2
]
&&
padded_out_w
==
output_shape
[
3
];
float
*
out_data
=
no_pad
?
output_data
:
padded_out_data
;
if
(
bias_data
!=
nullptr
)
{
const
index_t
batch
=
output_shape
[
0
];
const
index_t
channels
=
output_shape
[
1
];
const
index_t
img_size
=
output_shape
[
2
]
*
output_shape
[
3
];
#pragma omp parallel for collapse(3)
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channels
;
++
c
)
{
for
(
index_t
i
=
0
;
i
<
img_size
;
++
i
)
{
output_data
[(
b
*
channels
+
c
)
*
img_size
+
i
]
+=
bias_data
[
c
];
}
}
}
}
deconv_func
(
input_data
,
filter_data
,
bias_data
,
in_shape
,
padded_out_shape
.
data
(),
out_data
);
...
...
@@ -459,6 +466,8 @@ struct Deconv2dFunctor<DeviceType::CPU, float>: Deconv2dFunctorBase {
output_data
);
}
DoActivation
<
float
>
(
output_data
,
output_data
,
output
->
size
(),
...
...
mace/ops/deconv_2d_benchmark.cc
浏览文件 @
e3a8a08e
...
...
@@ -120,15 +120,12 @@ static void Deconv2d(int iters,
MACE_BM_DECONV_2D_MACRO(N, C, H, W, KH, KW, S, OH, OW, P, OC, float, GPU); \
MACE_BM_DECONV_2D_MACRO(N, C, H, W, KH, KW, S, OH, OW, P, OC, half, GPU);
MACE_BM_DECONV_2D
(
1
,
128
,
15
,
15
,
1
,
1
,
1
,
15
,
15
,
VALID
,
256
);
MACE_BM_DECONV_2D
(
1
,
32
,
60
,
60
,
1
,
1
,
1
,
60
,
60
,
VALID
,
128
);
MACE_BM_DECONV_2D
(
1
,
128
,
60
,
60
,
3
,
3
,
1
,
62
,
62
,
VALID
,
128
);
MACE_BM_DECONV_2D
(
1
,
32
,
60
,
60
,
3
,
3
,
1
,
60
,
60
,
SAME
,
32
);
MACE_BM_DECONV_2D
(
1
,
128
,
60
,
60
,
4
,
4
,
1
,
63
,
63
,
VALID
,
128
);
MACE_BM_DECONV_2D
(
1
,
32
,
60
,
60
,
4
,
4
,
1
,
60
,
60
,
SAME
,
32
);
MACE_BM_DECONV_2D
(
1
,
3
,
224
,
224
,
4
,
4
,
2
,
448
,
448
,
SAME
,
32
);
MACE_BM_DECONV_2D
(
1
,
3
,
224
,
224
,
4
,
4
,
2
,
450
,
450
,
VALID
,
32
);
MACE_BM_DECONV_2D
(
1
,
3
,
512
,
512
,
7
,
7
,
2
,
1023
,
1023
,
SAME
,
32
);
MACE_BM_DECONV_2D
(
1
,
128
,
16
,
16
,
5
,
5
,
1
,
20
,
20
,
VALID
,
32
);
...
...
@@ -138,10 +135,10 @@ MACE_BM_DECONV_2D(1, 3, 480, 480, 1, 1, 1, 480, 480, VALID, 3);
MACE_BM_DECONV_2D
(
1
,
64
,
32
,
32
,
1
,
1
,
1
,
32
,
32
,
VALID
,
128
);
MACE_BM_DECONV_2D
(
1
,
64
,
33
,
32
,
3
,
3
,
2
,
65
,
63
,
SAME
,
128
);
MACE_BM_DECONV_2D
(
1
,
3
,
224
,
224
,
3
,
3
,
2
,
447
,
447
,
SAME
,
32
);
MACE_BM_DECONV_2D
(
1
,
3
,
224
,
224
,
3
,
3
,
2
,
449
,
449
,
VALID
,
32
);
MACE_BM_DECONV_2D
(
1
,
3
,
224
,
224
,
3
,
3
,
2
,
448
,
448
,
SAME
,
32
);
MACE_BM_DECONV_2D
(
1
,
32
,
1014
,
762
,
9
,
9
,
2
,
2035
,
1531
,
VALID
,
1
);
}
// namespace test
}
// namespace ops
}
// namespace mace
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录