Xiaomi / Mace — Commit c2523ee2

Authored September 28, 2018 by 刘托

Merge branch 'optimize_deconv_cpu' into 'master'

optimize deconv cpu

See merge request !797

Parents: e2a40a03, ce1594a6

Showing 8 changed files with 1282 additions and 88 deletions:
mace/examples/android/gradle/wrapper/gradle-wrapper.properties   (+1, -1)
mace/kernels/arm/deconv_2d_neon.h                                (+100, -0)
mace/kernels/arm/deconv_2d_neon_3x3.cc                           (+405, -0)
mace/kernels/arm/deconv_2d_neon_4x4.cc                           (+516, -0)
mace/kernels/deconv_2d.h                                         (+245, -82)
mace/kernels/opencl/deconv_2d.cc                                 (+4, -1)
mace/ops/deconv_2d_benchmark.cc                                  (+7, -0)
mace/ops/deconv_2d_test.cc                                       (+4, -4)
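For orientation before the per-file diffs: the reworked CPU path in mace/kernels/deconv_2d.h first computes a padded deconvolution output of size (in - 1) * stride + kernel along each spatial dimension, runs a NEON or generic kernel into that padded buffer, and then crops the requested output from it. A minimal sketch of that size arithmetic follows; the function and variable names here are illustrative, not identifiers taken from the diff.

// Sketch only: the size relations used by the new CPU deconvolution path.
// Names are illustrative; the real code lives in mace/kernels/deconv_2d.h.
#include <cstdint>

struct DeconvSizes {
  int64_t out;         // requested output extent along one dimension
  int64_t padded_out;  // extent of the scratch buffer the kernels write into
  int64_t pad;         // rows/columns cropped from each side afterwards
};

inline DeconvSizes ComputeDeconvSizes(int64_t in, int64_t stride,
                                      int64_t kernel, int64_t padding) {
  DeconvSizes s;
  s.out = (in - 1) * stride + kernel - padding;  // deconv output size
  s.padded_out = (in - 1) * stride + kernel;     // size before cropping
  s.pad = (s.padded_out - s.out) / 2;            // symmetric crop per side
  return s;
}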
mace/examples/android/gradle/wrapper/gradle-wrapper.properties

-#Wed May 02 11:53:35 CST 2018
+#Wed Sep 12 13:35:24 CST 2018
 distributionBase=GRADLE_USER_HOME
 distributionPath=wrapper/dists
 zipStoreBase=GRADLE_USER_HOME
 ...
mace/kernels/arm/deconv_2d_neon.h   (new file, mode 100644)

// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef MACE_KERNELS_ARM_DECONV_2D_NEON_H_
#define MACE_KERNELS_ARM_DECONV_2D_NEON_H_

#if defined(MACE_ENABLE_NEON)
#include <arm_neon.h>
#endif

#include "mace/core/types.h"

namespace mace {
namespace kernels {

void Deconv2dNeonK3x3S1(const float *input,
                        const float *filter,
                        const float *bias,
                        const index_t *in_shape,
                        const index_t *out_shape,
                        float *output);

void Deconv2dNeonK3x3S2(const float *input,
                        const float *filter,
                        const float *bias,
                        const index_t *in_shape,
                        const index_t *out_shape,
                        float *output);

void Deconv2dNeonK4x4S1(const float *input,
                        const float *filter,
                        const float *bias,
                        const index_t *in_shape,
                        const index_t *out_shape,
                        float *output);

void Deconv2dNeonK4x4S2(const float *input,
                        const float *filter,
                        const float *bias,
                        const index_t *in_shape,
                        const index_t *out_shape,
                        float *output);

#ifdef MACE_ENABLE_NEON
inline float32x4_t neon_vfma_lane_0(float32x4_t a,
                                    float32x4_t b,
                                    float32x4_t c) {
#ifdef __aarch64__
  return vfmaq_laneq_f32(a, b, c, 0);
#else
  return vmlaq_lane_f32(a, b, vget_low_f32(c), 0);
#endif
}

inline float32x4_t neon_vfma_lane_1(float32x4_t a,
                                    float32x4_t b,
                                    float32x4_t c) {
#ifdef __aarch64__
  return vfmaq_laneq_f32(a, b, c, 1);
#else
  return vmlaq_lane_f32(a, b, vget_low_f32(c), 1);
#endif
}

inline float32x4_t neon_vfma_lane_2(float32x4_t a,
                                    float32x4_t b,
                                    float32x4_t c) {
#ifdef __aarch64__
  return vfmaq_laneq_f32(a, b, c, 2);
#else
  return vmlaq_lane_f32(a, b, vget_high_f32(c), 0);
#endif
}

inline float32x4_t neon_vfma_lane_3(float32x4_t a,
                                    float32x4_t b,
                                    float32x4_t c) {
#ifdef __aarch64__
  return vfmaq_laneq_f32(a, b, c, 3);
#else
  return vmlaq_lane_f32(a, b, vget_high_f32(c), 1);
#endif
}
#endif

}  // namespace kernels
}  // namespace mace

#endif  // MACE_KERNELS_ARM_DECONV_2D_NEON_H_
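A note on the helpers above: AArch64 provides vfmaq_laneq_f32, a fused multiply-add by an arbitrary lane of a 128-bit vector, while 32-bit NEON only offers vmlaq_lane_f32 against a 64-bit half, hence the vget_low_f32 / vget_high_f32 split. A minimal usage sketch follows; AccumulateTap2 is a hypothetical name, not part of MACE, and it assumes the header above is included and its helpers are in scope.

#if defined(MACE_ENABLE_NEON)
// Sketch: add four input pixels, scaled by filter tap 2, into four partial
// output sums. This mirrors a single load / fma / store step of the kernels
// declared above.
inline void AccumulateTap2(const float *in_ptr, const float *filter_row,
                           float *out_ptr) {
  float32x4_t acc = vld1q_f32(out_ptr);        // current partial sums
  float32x4_t in_vec = vld1q_f32(in_ptr);      // four input pixels
  float32x4_t k_vec = vld1q_f32(filter_row);   // four filter taps
  acc = neon_vfma_lane_2(acc, in_vec, k_vec);  // acc += in_vec * filter_row[2]
  vst1q_f32(out_ptr, acc);
}
#endif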
mace/kernels/arm/deconv_2d_neon_3x3.cc   (new file, mode 100644)

// Copyright 2018 Xiaomi, Inc. All rights reserved.
// (Apache License 2.0 header, identical to the one in deconv_2d_neon.h above.)

#include "mace/core/macros.h"
#include "mace/kernels/arm/deconv_2d_neon.h"

namespace mace {
namespace kernels {

void Deconv2dNeonK3x3S1(const float *input,
                        const float *filter,
                        const float *bias,
                        const index_t *in_shape,
                        const index_t *out_shape,
                        float *output) {
  const index_t inch = in_shape[1];
  const index_t h = in_shape[2];
  const index_t w = in_shape[3];
  const index_t outch = out_shape[1];
  const index_t outh = out_shape[2];
  const index_t outw = out_shape[3];
  const index_t out_img_size = outh * outw;

#pragma omp parallel for collapse(2)
  for (index_t b = 0; b < out_shape[0]; ++b) {
    for (index_t oc = 0; oc < outch; oc += 2) {
      if (oc + 1 < outch) {
        float *out_base0 = output + (b * outch + oc) * out_img_size;
        float *out_base1 = out_base0 + out_img_size;
        const float bias_value0 = bias ? bias[oc] : 0.f;
        const float bias_value1 = bias ? bias[oc + 1] : 0.f;
        std::fill_n(out_base0, out_img_size, bias_value0);
        std::fill_n(out_base1, out_img_size, bias_value1);
        for (index_t ic = 0; ic < inch; ++ic) {
          const float *input_base = input + (b * inch + ic) * h * w;
          const float *kernel_base0 = filter + (oc * inch + ic) * 9;
          const float *kernel_base1 = kernel_base0 + inch * 9;
          const float *in = input_base;
          // output channel 0
          const float *k0_0 = kernel_base0;
          const float *k0_1 = kernel_base0 + 3;
          const float *k0_2 = kernel_base0 + 5;
          // output channel 1
          const float *k1_0 = kernel_base1;
          const float *k1_1 = kernel_base1 + 3;
          const float *k1_2 = kernel_base1 + 5;
#if defined(MACE_ENABLE_NEON)
          // load filter
          float32x4_t k00_vec, k01_vec, k02_vec;
          float32x4_t k10_vec, k11_vec, k12_vec;
          k00_vec = vld1q_f32(k0_0);
          k01_vec = vld1q_f32(k0_1);
          k02_vec = vld1q_f32(k0_2);
          k10_vec = vld1q_f32(k1_0);
          k11_vec = vld1q_f32(k1_1);
          k12_vec = vld1q_f32(k1_2);
#endif
          for (index_t i = 0; i < h; ++i) {
            float *out_row_base0 = out_base0 + i * outw;
            float *out_row0_0 = out_row_base0;
            float *out_row0_1 = out_row_base0 + outw;
            float *out_row0_2 = out_row_base0 + 2 * outw;
            float *out_row_base1 = out_base1 + i * outw;
            float *out_row1_0 = out_row_base1;
            float *out_row1_1 = out_row_base1 + outw;
            float *out_row1_2 = out_row_base1 + 2 * outw;
            index_t j = 0;
#if defined(MACE_ENABLE_NEON)
            for (; j + 3 < w; j += 4) {
              float32x4_t in_vec = vld1q_f32(in);
              float32x4_t out00, out01, out02;
              float32x4_t out10, out11, out12;
              float32x4_t out20, out21, out22;

              out00 = vld1q_f32(out_row0_0);
              out00 = neon_vfma_lane_0(out00, in_vec, k00_vec);
              vst1q_f32(out_row0_0, out00);
              out01 = vld1q_f32(out_row0_0 + 1);
              out01 = neon_vfma_lane_1(out01, in_vec, k00_vec);
              vst1q_f32(out_row0_0 + 1, out01);
              out02 = vld1q_f32(out_row0_0 + 2);
              out02 = neon_vfma_lane_2(out02, in_vec, k00_vec);
              vst1q_f32(out_row0_0 + 2, out02);

              out10 = vld1q_f32(out_row0_1 + 0);
              out10 = neon_vfma_lane_0(out10, in_vec, k01_vec);
              vst1q_f32(out_row0_1 + 0, out10);
              out11 = vld1q_f32(out_row0_1 + 1);
              out11 = neon_vfma_lane_1(out11, in_vec, k01_vec);
              vst1q_f32(out_row0_1 + 1, out11);
              out12 = vld1q_f32(out_row0_1 + 2);
              out12 = neon_vfma_lane_2(out12, in_vec, k01_vec);
              vst1q_f32(out_row0_1 + 2, out12);

              out20 = vld1q_f32(out_row0_2 + 0);
              out20 = neon_vfma_lane_1(out20, in_vec, k02_vec);
              vst1q_f32(out_row0_2 + 0, out20);
              out21 = vld1q_f32(out_row0_2 + 1);
              out21 = neon_vfma_lane_2(out21, in_vec, k02_vec);
              vst1q_f32(out_row0_2 + 1, out21);
              out22 = vld1q_f32(out_row0_2 + 2);
              out22 = neon_vfma_lane_3(out22, in_vec, k02_vec);
              vst1q_f32(out_row0_2 + 2, out22);

              // [diff condensed: the same nine load / fma / store steps are
              //  repeated for output channel 1, writing out_row1_0..out_row1_2
              //  with k10_vec, k11_vec and k12_vec.]

              in += 4;
              out_row0_0 += 4;
              out_row0_1 += 4;
              out_row0_2 += 4;
              out_row1_0 += 4;
              out_row1_1 += 4;
              out_row1_2 += 4;
            }
#endif
            for (; j < w; ++j) {
              float val = in[0];
              for (int k = 0; k < 3; ++k) {
                out_row0_0[k] += val * k0_0[k];
                out_row0_1[k] += val * k0_1[k];
                out_row0_2[k] += val * k0_2[k + 1];
                out_row1_0[k] += val * k1_0[k];
                out_row1_1[k] += val * k1_1[k];
                out_row1_2[k] += val * k1_2[k + 1];
              }
              in++;
              out_row0_0++;
              out_row0_1++;
              out_row0_2++;
              out_row1_0++;
              out_row1_1++;
              out_row1_2++;
            }
          }
        }
      } else {
        // [diff condensed: the trailing single output channel follows the same
        //  structure restricted to channel 0, using out_base0, k0_0..k0_2 and
        //  k00_vec..k02_vec only.]
      }
    }
  }
}

void Deconv2dNeonK3x3S2(const float *input,
                        const float *filter,
                        const float *bias,
                        const index_t *in_shape,
                        const index_t *out_shape,
                        float *output) {
  const index_t inch = in_shape[1];
  const index_t h = in_shape[2];
  const index_t w = in_shape[3];
  const index_t outch = out_shape[1];
  const index_t outh = out_shape[2];
  const index_t outw = out_shape[3];
  const index_t out_img_size = outh * outw;

#pragma omp parallel for collapse(2)
  for (index_t b = 0; b < out_shape[0]; ++b) {
    for (index_t oc = 0; oc < outch; ++oc) {
      float *out_base = output + (b * outch + oc) * out_img_size;
      const float bias_value = bias ? bias[oc] : 0.f;
      std::fill_n(out_base, out_img_size, bias_value);
      for (index_t ic = 0; ic < inch; ++ic) {
        const float *input_base = input + (b * inch + ic) * h * w;
        const float *kernel_base = filter + (oc * inch + ic) * 9;
        const float *in = input_base;
        const float *k0 = kernel_base;
        const float *k1 = kernel_base + 3;
        const float *k2 = kernel_base + 5;
#if defined(MACE_ENABLE_NEON)
        float32x4_t k0_vec = vld1q_f32(k0);
        float32x4_t k1_vec = vld1q_f32(k1);
        float32x4_t k2_vec = vld1q_f32(k2);
#endif
        for (index_t i = 0; i < h; ++i) {
          float *out_row_base = out_base + i * 2 * outw;
          float *out_row_0 = out_row_base;
          float *out_row_1 = out_row_0 + outw;
          float *out_row_2 = out_row_1 + outw;
          index_t j = 0;
#if defined(MACE_ENABLE_NEON)
          for (; j + 3 < w; j += 4) {
            float32x4_t in_vec = vld1q_f32(in);
            // out row 0
            float32x4x2_t out00 = vld2q_f32(out_row_0);
            out00.val[0] = neon_vfma_lane_0(out00.val[0], in_vec, k0_vec);
            out00.val[1] = neon_vfma_lane_1(out00.val[1], in_vec, k0_vec);
            vst2q_f32(out_row_0, out00);
            float32x4x2_t out01 = vld2q_f32(out_row_0 + 2);
            out01.val[0] = neon_vfma_lane_2(out01.val[0], in_vec, k0_vec);
            vst2q_f32(out_row_0 + 2, out01);
            // out row 1
            float32x4x2_t out10 = vld2q_f32(out_row_1);
            out10.val[0] = neon_vfma_lane_0(out10.val[0], in_vec, k1_vec);
            out10.val[1] = neon_vfma_lane_1(out10.val[1], in_vec, k1_vec);
            vst2q_f32(out_row_1, out10);
            float32x4x2_t out11 = vld2q_f32(out_row_1 + 2);
            out11.val[0] = neon_vfma_lane_2(out11.val[0], in_vec, k1_vec);
            vst2q_f32(out_row_1 + 2, out11);
            // out row 2
            float32x4x2_t out20 = vld2q_f32(out_row_2);
            out20.val[0] = neon_vfma_lane_1(out20.val[0], in_vec, k2_vec);
            out20.val[1] = neon_vfma_lane_2(out20.val[1], in_vec, k2_vec);
            vst2q_f32(out_row_2, out20);
            float32x4x2_t out21 = vld2q_f32(out_row_2 + 2);
            out21.val[0] = neon_vfma_lane_3(out21.val[0], in_vec, k2_vec);
            vst2q_f32(out_row_2 + 2, out21);

            in += 4;
            out_row_0 += 8;
            out_row_1 += 8;
            out_row_2 += 8;
          }
#endif
          for (; j < w; ++j) {
            float val = in[0];
            for (int k = 0; k < 3; ++k) {
              out_row_0[k] += val * k0[k];
              out_row_1[k] += val * k1[k];
              out_row_2[k] += val * k2[k + 1];
            }
            in++;
            out_row_0 += 2;
            out_row_1 += 2;
            out_row_2 += 2;
          }
        }
      }
    }
  }
}

}  // namespace kernels
}  // namespace mace
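One detail of the 3x3 kernels above that is easy to miss: the third filter row is addressed at kernel_base + 5 rather than + 6, its vector accumulations use lanes 1-3, and the scalar tail indexes it as k0_2[k + 1]. A 3x3 filter slice occupies nine consecutive floats, so a four-wide vld1q_f32 at offset 6 would read past it; loading at offset 5 keeps the load in bounds, which appears to be the reason for the shifted indexing. A hypothetical sketch of that load pattern (LoadFilter3x3 is not a MACE function):

#if defined(MACE_ENABLE_NEON)
// Sketch: load the three rows of one 3x3 filter the way the kernel above does.
// "taps" is assumed to point at 9 consecutive filter weights.
inline void LoadFilter3x3(const float *taps,
                          float32x4_t *row0, float32x4_t *row1,
                          float32x4_t *row2) {
  *row0 = vld1q_f32(taps);      // lanes 0..2 hold row 0
  *row1 = vld1q_f32(taps + 3);  // lanes 0..2 hold row 1
  // A load at taps + 6 would read past the 9-float filter, so the kernel
  // loads at taps + 5 and consumes row 2 through lanes 1..3
  // (neon_vfma_lane_1/2/3, and k0_2[k + 1] in the scalar tail).
  *row2 = vld1q_f32(taps + 5);
}
#endif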
mace/kernels/arm/deconv_2d_neon_4x4.cc   (new file, mode 100644)

// Copyright 2018 Xiaomi, Inc. All rights reserved.
// (Apache License 2.0 header, identical to the one in deconv_2d_neon.h above.)

#include "mace/core/macros.h"
#include "mace/kernels/arm/deconv_2d_neon.h"

namespace mace {
namespace kernels {

void Deconv2dNeonK4x4S1(const float *input,
                        const float *filter,
                        const float *bias,
                        const index_t *in_shape,
                        const index_t *out_shape,
                        float *output) {
  const index_t w = in_shape[3];
  const index_t h = in_shape[2];
  const index_t inch = in_shape[1];
  const index_t outh = out_shape[2];
  const index_t outw = out_shape[3];
  const index_t outch = out_shape[1];
  const index_t out_img_size = outh * outw;

#pragma omp parallel for
  for (int b = 0; b < out_shape[0]; ++b) {
    for (int oc = 0; oc < outch; oc += 2) {
      if (oc + 1 < outch) {
        float *out_base = output + (b * outch + oc) * out_img_size;
        float *out_base1 = out_base + out_img_size;
        const float bias_value = bias ? bias[oc] : 0.f;
        std::fill_n(out_base, out_img_size, bias_value);
        const float bias_value1 = bias ? bias[oc + 1] : 0.f;
        std::fill_n(out_base1, out_img_size, bias_value1);
        for (int q = 0; q < inch; q++) {
          const float *input_base = input + (b * inch + q) * h * w;
          const float *in = input_base;
          const float *kernel_base = filter + (oc * inch + q) * 16;
          const float *k0 = kernel_base;
          const float *k1 = kernel_base + 4;
          const float *k2 = kernel_base + 8;
          const float *k3 = kernel_base + 12;
          const float *kernel_base1 = kernel_base + inch * 16;
          const float *k10 = kernel_base1;
          const float *k11 = kernel_base1 + 4;
          const float *k12 = kernel_base1 + 8;
          const float *k13 = kernel_base1 + 12;
#if defined(MACE_ENABLE_NEON)
          float32x4_t k0_vec = vld1q_f32(k0);
          float32x4_t k1_vec = vld1q_f32(k1);
          float32x4_t k2_vec = vld1q_f32(k2);
          float32x4_t k3_vec = vld1q_f32(k3);
          float32x4_t k10_vec = vld1q_f32(k10);
          float32x4_t k11_vec = vld1q_f32(k11);
          float32x4_t k12_vec = vld1q_f32(k12);
          float32x4_t k13_vec = vld1q_f32(k13);
#endif
          for (int i = 0; i < h; i++) {
            float *out_row = out_base + i * outw;
            float *out_row_0 = out_row;
            float *out_row_1 = out_row_0 + outw;
            float *out_row_2 = out_row_1 + outw;
            float *out_row_3 = out_row_2 + outw;
            float *out_row1 = out_base1 + i * outw;
            float *out_row1_0 = out_row1;
            float *out_row1_1 = out_row1_0 + outw;
            float *out_row1_2 = out_row1_1 + outw;
            float *out_row1_3 = out_row1_2 + outw;
            int j = 0;
#if defined(MACE_ENABLE_NEON)
            for (; j + 3 < w; j += 4) {
              float32x4_t in_vec = vld1q_f32(in);
              float32x4_t out00, out01, out02, out03;
              float32x4_t out10, out11, out12, out13;

              out00 = vld1q_f32(out_row_0);
              out00 = neon_vfma_lane_0(out00, in_vec, k0_vec);
              vst1q_f32(out_row_0, out00);
              out10 = vld1q_f32(out_row1_0);
              out10 = neon_vfma_lane_0(out10, in_vec, k10_vec);
              vst1q_f32(out_row1_0, out10);
              out01 = vld1q_f32(out_row_0 + 1);
              out01 = neon_vfma_lane_1(out01, in_vec, k0_vec);
              vst1q_f32(out_row_0 + 1, out01);
              out11 = vld1q_f32(out_row1_0 + 1);
              out11 = neon_vfma_lane_1(out11, in_vec, k10_vec);
              vst1q_f32(out_row1_0 + 1, out11);

              // [diff condensed: the same pattern continues for lane offsets
              //  2 and 3 of output row 0, then for output rows 1, 2 and 3 of
              //  both channels, pairing out_row_r with kr_vec and out_row1_r
              //  with k1r_vec, lanes 0..3 at offsets +0..+3.]

              in += 4;
              out_row_0 += 4;
              out_row_1 += 4;
              out_row_2 += 4;
              out_row_3 += 4;
              out_row1_0 += 4;
              out_row1_1 += 4;
              out_row1_2 += 4;
              out_row1_3 += 4;
            }
#endif
            for (; j < w; j++) {
              float val = in[0];
              for (int k = 0; k < 4; ++k) {
                out_row_0[k] += val * k0[k];
                out_row_1[k] += val * k1[k];
                out_row_2[k] += val * k2[k];
                out_row_3[k] += val * k3[k];
                out_row1_0[k] += val * k10[k];
                out_row1_1[k] += val * k11[k];
                out_row1_2[k] += val * k12[k];
                out_row1_3[k] += val * k13[k];
              }
              in++;
              out_row_0++;
              out_row_1++;
              out_row_2++;
              out_row_3++;
              out_row1_0++;
              out_row1_1++;
              out_row1_2++;
              out_row1_3++;
            }
          }
        }
      } else {
        // [diff condensed: the trailing single output channel repeats the
        //  same structure with k0..k3 / k0_vec..k3_vec only, accumulating
        //  into out_row_0..out_row_3.]
      }
    }
  }
}

void Deconv2dNeonK4x4S2(const float *input,
                        const float *filter,
                        const float *bias,
                        const index_t *in_shape,
                        const index_t *out_shape,
                        float *output) {
  const index_t w = in_shape[3];
  const index_t h = in_shape[2];
  const index_t inch = in_shape[1];
  const index_t outh = out_shape[2];
  const index_t outw = out_shape[3];
  const index_t outch = out_shape[1];
  const index_t out_img_size = outh * outw;

#pragma omp parallel for
  for (int b = 0; b < out_shape[0]; ++b) {
    for (int p = 0; p < outch; p++) {
      float *out_base = output + (b * outch + p) * out_img_size;
      const float bias_value = bias ? bias[p] : 0.f;
      std::fill_n(out_base, outh * outw, bias_value);
      for (int q = 0; q < inch; q++) {
        const float *input_base = input + (b * inch + q) * h * w;
        const float *kernel_base = filter + (p * inch + q) * 16;
        const float *in = input_base;
        const float *k0 = kernel_base;
        const float *k1 = kernel_base + 4;
        const float *k2 = kernel_base + 8;
        const float *k3 = kernel_base + 12;
#if defined(MACE_ENABLE_NEON)
        float32x4_t k0_vec = vld1q_f32(k0);
        float32x4_t k1_vec = vld1q_f32(k1);
        float32x4_t k2_vec = vld1q_f32(k2);
        float32x4_t k3_vec = vld1q_f32(k3);
#endif
        for (int i = 0; i < h; i++) {
          float *out_row = out_base + 2 * i * outw;
          float *out_row_0 = out_row;
          float *out_row_1 = out_row_0 + outw;
          float *out_row_2 = out_row_1 + outw;
          float *out_row_3 = out_row_2 + outw;
          int j = 0;
#if defined(MACE_ENABLE_NEON)
          for (; j + 3 < w; j += 4) {
            float32x4_t in_vec = vld1q_f32(in);
            // row 0
            float32x4x2_t out0 = vld2q_f32(out_row_0);
            out0.val[0] = neon_vfma_lane_0(out0.val[0], in_vec, k0_vec);
            out0.val[1] = neon_vfma_lane_1(out0.val[1], in_vec, k0_vec);
            vst2q_f32(out_row_0, out0);
            out0 = vld2q_f32(out_row_0 + 2);
            out0.val[0] = neon_vfma_lane_2(out0.val[0], in_vec, k0_vec);
            out0.val[1] = neon_vfma_lane_3(out0.val[1], in_vec, k0_vec);
            vst2q_f32(out_row_0 + 2, out0);

            // [diff condensed: rows 1, 2 and 3 repeat the same two
            //  de-interleaved load / fma / store steps with k1_vec, k2_vec
            //  and k3_vec.]

            in += 4;
            out_row_0 += 8;
            out_row_1 += 8;
            out_row_2 += 8;
            out_row_3 += 8;
          }
#endif
          for (; j < w; j++) {
            float val = in[0];
            for (int k = 0; k < 4; ++k) {
              out_row_0[k] += val * k0[k];
              out_row_1[k] += val * k1[k];
              out_row_2[k] += val * k2[k];
              out_row_3[k] += val * k3[k];
            }
            in++;
            out_row_0 += 2;
            out_row_1 += 2;
            out_row_2 += 2;
            out_row_3 += 2;
          }
        }
      }
    }
  }
}

}  // namespace kernels
}  // namespace mace
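The stride-2 kernels above lean on de-interleaving loads: with stride 2, four consecutive input pixels scatter into every other output column, so vld2q_f32 splits eight output values into even columns (val[0]) and odd columns (val[1]) and each phase takes one fused multiply-add. A minimal sketch of that single step follows; ScatterStride2 is a hypothetical name, and it assumes the helpers from deconv_2d_neon.h are in scope.

#if defined(MACE_ENABLE_NEON)
// Sketch: one de-interleaved accumulation step of the S2 kernels above.
inline void ScatterStride2(const float *in_ptr, float32x4_t k_vec,
                           float *out_ptr) {
  float32x4_t in_vec = vld1q_f32(in_ptr);                     // four input pixels
  float32x4x2_t out = vld2q_f32(out_ptr);                     // 8 outputs, de-interleaved
  out.val[0] = neon_vfma_lane_0(out.val[0], in_vec, k_vec);   // filter tap 0 -> even cols
  out.val[1] = neon_vfma_lane_1(out.val[1], in_vec, k_vec);   // filter tap 1 -> odd cols
  vst2q_f32(out_ptr, out);
}
#endif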
mace/kernels/deconv_2d.h

@@ -15,76 +15,25 @@
(The NEON include guard loses its __aarch64__ restriction, the new ARM kernel
header is included, and the old generic deconv::Deconv2dNCHW template that this
hunk used to contain is removed.)

 #ifndef MACE_KERNELS_DECONV_2D_H_
 #define MACE_KERNELS_DECONV_2D_H_

-#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
+#if defined(MACE_ENABLE_NEON)
 #include <arm_neon.h>
 #endif

 #include <algorithm>
 #include <functional>
 #include <memory>
 #include <vector>

 #include "mace/core/future.h"
 #include "mace/core/tensor.h"
 #include "mace/kernels/activation.h"
 #include "mace/kernels/arm/deconv_2d_neon.h"
 #include "mace/kernels/conv_pool_2d_util.h"
 #include "mace/utils/utils.h"

 namespace mace {
 namespace kernels {

-namespace deconv {
-
-template <typename T>
-void Deconv2dNCHW(const T *input, const T *filter, const T *bias,
-                  const index_t *in_shape, const index_t *out_shape,
-                  const index_t *kernel_hw, const int *strides,
-                  const int *padding, float *output) {
-  // [diff condensed: an OpenMP collapse(4) loop over every output element
-  //  that walked the filter backwards from each output position and
-  //  accumulated input[in_pos] * filter[weight_pos], adding bias at the end.]
-}
-
-}  // namespace deconv
-
 struct Deconv2dFunctorBase : OpKernel {
   Deconv2dFunctorBase(OpKernelContext *context,
                       const std::vector<int> &strides,

@@ -107,6 +56,7 @@ struct Deconv2dFunctorBase : OpKernel {
(One line is added in this hunk of CalcDeconvOutputSize; the surrounding
parameter list and checks read:)

                                   const int *strides,
                                   index_t *output_shape,
                                   const int *padding_size,
                                   int *input_padding,
                                   const bool isNCHW = false) {
     MACE_CHECK_NOTNULL(output_shape);
     MACE_CHECK_NOTNULL(padding_size);

@@ -119,13 +69,18 @@ struct Deconv2dFunctorBase : OpKernel {
(filter_h / filter_w become kernel_h / kernel_w, and the derived input_padding
is computed and clamped to be non-negative.)

     const index_t in_height = isNCHW ? input_shape[2] : input_shape[1];
     const index_t in_width = isNCHW ? input_shape[3] : input_shape[2];

-    const index_t filter_h = filter_shape[2];
-    const index_t filter_w = filter_shape[3];
+    const index_t kernel_h = filter_shape[2];
+    const index_t kernel_w = filter_shape[3];
+
+    input_padding[0] = static_cast<int>((kernel_h - 1) * 2 - padding_size[0]);
+    input_padding[1] = static_cast<int>((kernel_w - 1) * 2 - padding_size[1]);
+    input_padding[0] = std::max<int>(0, input_padding[0]);
+    input_padding[1] = std::max<int>(0, input_padding[1]);

     index_t out_height =
-        (in_height - 1) * strides[0] + filter_h - padding_size[0];
+        (in_height - 1) * strides[0] + kernel_h - padding_size[0];
     index_t out_width =
-        (in_width - 1) * strides[1] + filter_w - padding_size[1];
+        (in_width - 1) * strides[1] + kernel_w - padding_size[1];

     output_shape[0] = input_shape[0];
     if (isNCHW) {

@@ -206,8 +161,12 @@ struct Deconv2dFunctorBase : OpKernel {
(The generic Deconv2dFunctor template becomes a forward declaration plus an
explicit CPU/float specialization.)

   const float relux_max_limit_;
 };

 template <DeviceType D, typename T>
-struct Deconv2dFunctor : Deconv2dFunctorBase {
+struct Deconv2dFunctor;
+
+template <>
+struct Deconv2dFunctor<DeviceType::CPU, float>: Deconv2dFunctorBase {
   Deconv2dFunctor(OpKernelContext *context,
                   const std::vector<int> &strides,
                   const Padding &padding_type,

@@ -223,6 +182,92 @@ struct Deconv2dFunctor : Deconv2dFunctorBase {
(The CPU specialization gains two helpers, both added in this hunk:
Deconv2dGeneral, a scatter-style fallback for arbitrary kernel sizes, and
CropPadOut, which copies the requested window out of the padded result.)

                       activation,
                       relux_max_limit) {}

  void Deconv2dGeneral(const float *input,
                       const float *filter,
                       const float *bias,
                       const index_t kernel_h,
                       const index_t kernel_w,
                       const int *strides,
                       const index_t *in_shape,
                       const index_t *out_shape,
                       float *output) {
    const index_t kernel_size = kernel_h * kernel_w;
    std::vector<int> out_map(kernel_size);
    int p0 = 0;
    int p1 = 0;
    index_t gap = out_shape[3] - kernel_w;
    for (int i = 0; i < kernel_h; ++i) {
      for (int j = 0; j < kernel_w; ++j) {
        out_map[p0] = p1;
        p0++;
        p1++;
      }
      p1 += gap;
    }
    const index_t out_height = out_shape[2];
    const index_t out_width = out_shape[3];
    const index_t in_height = in_shape[2];
    const index_t in_width = in_shape[3];
    const index_t out_img_size = out_height * out_width;
    const index_t in_img_size = in_height * in_width;
#pragma omp parallel for
    for (int b = 0; b < in_shape[0]; ++b) {
      for (int oc = 0; oc < out_shape[1]; ++oc) {
        float *out_base = output + (b * out_shape[1] + oc) * out_img_size;
        const float bias_value = bias ? bias[oc] : 0.f;
        std::fill_n(out_base, out_img_size, bias_value);
        for (int i = 0; i < in_height; ++i) {
          for (int j = 0; j < in_width; ++j) {
            const index_t out_offset =
                i * strides[0] * out_width + j * strides[1];
            for (int ic = 0; ic < in_shape[1]; ++ic) {
              const index_t input_idx =
                  (b * in_shape[1] + ic) * in_img_size + i * in_width + j;
              const float val = input[input_idx];
              const index_t kernel_offset =
                  (oc * in_shape[1] + ic) * kernel_size;
              for (int k = 0; k < kernel_size; ++k) {
                const index_t out_idx = out_offset + out_map[k];
                const index_t kernel_idx = kernel_offset + k;
                out_base[out_idx] += val * filter[kernel_idx];
              }
            }
          }
        }
      }
    }
  }

  void CropPadOut(const float *input,
                  const index_t *in_shape,
                  const index_t *out_shape,
                  const index_t pad_h,
                  const index_t pad_w,
                  float *output) {
    const index_t batch = in_shape[0];
    const index_t channel = in_shape[1];
    const index_t in_height = in_shape[2];
    const index_t in_width = in_shape[3];
    const index_t out_height = out_shape[2];
    const index_t out_width = out_shape[3];
#pragma omp parallel for
    for (int i = 0; i < batch; ++i) {
      for (int j = 0; j < channel; ++j) {
        for (int k = 0; k < out_height; ++k) {
          const float *input_base =
              input + ((i * channel + j) * in_height + (k + pad_h)) * in_width;
          float *output_base =
              output + ((i * channel + j) * out_height + k) * out_width;
          memcpy(output_base, input_base + pad_w, out_width * sizeof(float));
        }
      }
    }
  }

  MaceStatus operator()(const Tensor *input,   // NCHW
                        const Tensor *filter,  // OIHW
                        const Tensor *bias,

@@ -235,6 +280,7 @@ struct Deconv2dFunctor : Deconv2dFunctorBase {
     MACE_CHECK_NOTNULL(output);
     std::vector<int> paddings(2);
+    std::vector<int> out_paddings(2);
     std::vector<index_t> output_shape(4);
     if (paddings_.empty()) {  // tensorflow
       paddings = std::vector<int>(2, 0);

@@ -261,19 +307,20 @@ struct Deconv2dFunctor : Deconv2dFunctorBase {
(The caffe branch now keeps the caller-supplied paddings in out_paddings and
passes both arrays to CalcDeconvOutputSize.)

                            output_shape.data(),
                            paddings.data(),
                            true);
     } else {  // caffe
-      paddings = paddings_;
+      out_paddings = paddings_;
       output_shape = std::vector<index_t>(4, 0);
       CalcDeconvOutputSize(input->shape().data(),
                            filter->shape().data(),
                            strides_.data(),
                            output_shape.data(),
+                           out_paddings.data(),
                            paddings.data(),
                            true);
     }
     MACE_RETURN_IF_ERROR(output->Resize(output_shape));

     index_t kernel_h = filter->dim(2);
     index_t kernel_w = filter->dim(3);
     const index_t *in_shape = input->shape().data();
     const index_t kernel_hw[2] = {kernel_h, kernel_w};
     MACE_CHECK(filter->dim(0) == output_shape[1],
                filter->dim(0), " != ", output_shape[1]);

@@ -281,28 +328,144 @@ struct Deconv2dFunctor : Deconv2dFunctorBase {
(The execution path is rewritten: the old code, marked "-" below, mapped the
tensors as T, halved the paddings and called the generic deconv::Deconv2dNCHW;
the new code maps them as float, sizes a padded output in the scratch buffer,
dispatches to a NEON kernel for 3x3/4x4 filters with stride 1 or 2, falls back
to Deconv2dGeneral otherwise, and crops before applying the activation.)

                in_shape[1]);
     MACE_CHECK(in_shape[0] == output_shape[0],
                "Input/Output batch size mismatch");

     std::function<void(const float *input,
                        const float *filter,
                        const float *bias,
                        const index_t *in_shape,
                        const index_t *out_shape,
                        float *output)> deconv_func;

     Tensor::MappingGuard input_mapper(input);
     Tensor::MappingGuard filter_mapper(filter);
     Tensor::MappingGuard bias_mapper(bias);
     Tensor::MappingGuard output_mapper(output);

-    auto input_data = input->data<T>();
-    auto filter_data = filter->data<T>();
-    auto bias_data = bias == nullptr ? nullptr : bias->data<T>();
-    auto output_data = output->mutable_data<T>();
-    int padding[2];
-    padding[0] = (paddings[0] + 1) >> 1;
-    padding[1] = (paddings[1] + 1) >> 1;
-    deconv::Deconv2dNCHW(input_data, filter_data, bias_data,
-                         in_shape, output_shape.data(), kernel_hw,
-                         strides_.data(), padding, output_data);
-    DoActivation(output_data, ...

     auto input_data = input->data<float>();
     auto filter_data = filter->data<float>();
     auto bias_data = bias == nullptr ? nullptr : bias->data<float>();
     auto output_data = output->mutable_data<float>();

     const index_t padded_out_h = (in_shape[2] - 1) * strides_[0] + kernel_h;
     const index_t padded_out_w = (in_shape[3] - 1) * strides_[1] + kernel_w;
     const index_t pad_h = (padded_out_h - output_shape[2]) / 2;
     const index_t pad_w = (padded_out_w - output_shape[3]) / 2;

     std::vector<index_t> padded_out_shape({output_shape[0], output_shape[1],
                                            padded_out_h, padded_out_w});
     index_t padded_out_size =
         std::accumulate(padded_out_shape.begin(), padded_out_shape.end(), 1,
                         std::multiplies<index_t>()) * sizeof(float);
     ScratchBuffer *scratch = context_->device()->scratch_buffer();
     scratch->Rewind();
     scratch->GrowSize(padded_out_size);

     Tensor padded_out(scratch->Scratch(padded_out_size), DT_FLOAT);
     auto *padded_out_data = padded_out.mutable_data<float>();

     bool use_neon_3x3_s1 = kernel_h == kernel_w && kernel_h == 3 &&
                            strides_[0] == strides_[1] && strides_[0] == 1;
     bool use_neon_3x3_s2 = kernel_h == kernel_w && kernel_h == 3 &&
                            strides_[0] == strides_[1] && strides_[0] == 2;
     bool use_neon_4x4_s1 = kernel_h == kernel_w && kernel_h == 4 &&
                            strides_[0] == strides_[1] && strides_[0] == 1;
     bool use_neon_4x4_s2 = kernel_h == kernel_w && kernel_h == 4 &&
                            strides_[0] == strides_[1] && strides_[0] == 2;

     if (use_neon_3x3_s1) {
       deconv_func = [=](const float *input,
                         const float *filter,
                         const float *bias,
                         const index_t *in_shape,
                         const index_t *padded_out_shape,
                         float *padded_output) {
         Deconv2dNeonK3x3S1(input, filter, bias, in_shape,
                            padded_out_shape, padded_output);
       };
     } else if (use_neon_3x3_s2) {
       // [diff condensed: an identical lambda calling Deconv2dNeonK3x3S2]
     } else if (use_neon_4x4_s1) {
       // [diff condensed: an identical lambda calling Deconv2dNeonK4x4S1]
     } else if (use_neon_4x4_s2) {
       // [diff condensed: an identical lambda calling Deconv2dNeonK4x4S2]
     } else {
       deconv_func = [=](const float *input,
                         const float *filter,
                         const float *bias,
                         const index_t *in_shape,
                         const index_t *padded_out_shape,
                         float *padded_output) {
         Deconv2dGeneral(input, filter, bias, kernel_h, kernel_w,
                         strides_.data(), in_shape, padded_out_shape,
                         padded_output);
       };
     }

     bool no_pad =
         padded_out_h == output_shape[2] && padded_out_w == output_shape[3];
     float *out_data = no_pad ? output_data : padded_out_data;

     deconv_func(input_data, filter_data, bias_data, in_shape,
                 padded_out_shape.data(), out_data);
     if (!no_pad) {
       CropPadOut(out_data, padded_out_shape.data(), output_shape.data(),
                  pad_h, pad_w, output_data);
     }

     DoActivation<float>(output_data, output_data, output->size(),
                         activation_, ...
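To make the new padded-then-cropped flow concrete, here is a worked size example using the formulas from the functor above; the numbers match the 224x224, kernel 3, stride 2 benchmark configurations added further below.

// Worked example: input 224x224, kernel 3, stride 2 (see the benchmark list).
constexpr int in = 224, stride = 2, kernel = 3;
constexpr int padded_out = (in - 1) * stride + kernel;   // 449
static_assert(padded_out == 449, "VALID case: used directly, no crop needed");
constexpr int same_out = 447;                            // SAME output size
constexpr int pad = (padded_out - same_out) / 2;         // 1
static_assert(pad == 1, "SAME case: CropPadOut drops one row/column per side");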
mace/kernels/opencl/deconv_2d.cc

@@ -53,6 +53,7 @@ MaceStatus Deconv2dFunctor<DeviceType::GPU, T>::operator()(
   MACE_CHECK_NOTNULL(filter);
   MACE_CHECK_NOTNULL(output);
   std::vector<int> paddings(2);
+  std::vector<int> out_paddings(2);
   std::vector<index_t> output_shape(4);
   if (paddings_.empty()) {
     paddings = std::vector<int>(2, 0);

@@ -74,12 +75,14 @@ MaceStatus Deconv2dFunctor<DeviceType::GPU, T>::operator()(
                          output_shape.data(),
                          paddings.data());
   } else {
-    paddings = paddings_;
+    out_paddings = paddings_;
+    paddings = std::vector<int>(2, 0);
     output_shape = std::vector<index_t>(4, 0);
     CalcDeconvOutputSize(input->shape().data(),
                          filter->shape().data(),
                          strides_.data(),
                          output_shape.data(),
+                         out_paddings.data(),
                          paddings.data());
   }
mace/ops/deconv_2d_benchmark.cc

@@ -116,6 +116,7 @@ static void Deconv2d(int iters,
 // TODO(liutuo): add cpu benchmark when optimized.
 #define MACE_BM_DECONV_2D(N, C, H, W, KH, KW, S, OH, OW, P, OC)              \
+  MACE_BM_DECONV_2D_MACRO(N, C, H, W, KH, KW, S, OH, OW, P, OC, float, CPU); \
   MACE_BM_DECONV_2D_MACRO(N, C, H, W, KH, KW, S, OH, OW, P, OC, float, GPU); \
   MACE_BM_DECONV_2D_MACRO(N, C, H, W, KH, KW, S, OH, OW, P, OC, half, GPU);

@@ -124,6 +125,11 @@ MACE_BM_DECONV_2D(1, 32, 60, 60, 1, 1, 1, 60, 60, VALID, 128);
 MACE_BM_DECONV_2D(1, 128, 60, 60, 3, 3, 1, 62, 62, VALID, 128);
 MACE_BM_DECONV_2D(1, 32, 60, 60, 3, 3, 1, 60, 60, SAME, 32);
 MACE_BM_DECONV_2D(1, 128, 60, 60, 4, 4, 1, 63, 63, VALID, 128);
 MACE_BM_DECONV_2D(1, 32, 60, 60, 4, 4, 1, 60, 60, SAME, 32);
 MACE_BM_DECONV_2D(1, 3, 224, 224, 4, 4, 2, 448, 448, SAME, 32);
 MACE_BM_DECONV_2D(1, 3, 224, 224, 4, 4, 2, 450, 450, VALID, 32);
 MACE_BM_DECONV_2D(1, 3, 512, 512, 7, 7, 2, 1023, 1023, SAME, 32);
 MACE_BM_DECONV_2D(1, 128, 16, 16, 5, 5, 1, 20, 20, VALID, 32);
 MACE_BM_DECONV_2D(1, 128, 64, 64, 5, 5, 1, 68, 68, VALID, 32);

@@ -134,6 +140,7 @@ MACE_BM_DECONV_2D(1, 64, 32, 32, 1, 1, 1, 32, 32, VALID, 128);
 MACE_BM_DECONV_2D(1, 64, 33, 32, 3, 3, 2, 65, 63, SAME, 128);
 MACE_BM_DECONV_2D(1, 3, 224, 224, 3, 3, 2, 447, 447, SAME, 32);
 MACE_BM_DECONV_2D(1, 3, 224, 224, 3, 3, 2, 449, 449, VALID, 32);
 MACE_BM_DECONV_2D(1, 3, 224, 224, 3, 3, 2, 448, 448, SAME, 32);

 }  // namespace test
 }  // namespace ops
mace/ops/deconv_2d_test.cc

@@ -380,11 +380,11 @@ void TestComplexDeconvNxNS12(const int batch,
                   1e-4);
   };

-  for (int kernel_size : {1, 3, 5, 7}) {
+  for (int kernel_size : {3, 4, 5, 7}) {
     func(kernel_size, kernel_size, stride, stride, VALID, -1);
     func(kernel_size, kernel_size, stride, stride, SAME, -1);
     func(kernel_size, kernel_size, stride, stride, VALID, 1);
     func(kernel_size, kernel_size, stride, stride, VALID, 2);
     func(kernel_size, kernel_size, stride, stride, VALID, 3);
   }
 }

 }  // namespace

@@ -410,8 +410,8 @@ TEST_F(Deconv2dOpTest, OPENCLUnalignedDeconvNxNS34) {
 }

 TEST_F(Deconv2dOpTest, OPENCLUnalignedDeconvNxNMultiBatch) {
-  TestComplexDeconvNxNS12<DeviceType::GPU, float>(3, {17, 13, 5, 7}, 1);
-  TestComplexDeconvNxNS12<DeviceType::GPU, float>(5, {17, 13, 5, 7}, 2);
+  TestComplexDeconvNxNS12<DeviceType::GPU, float>(3, {17, 113, 5, 7}, 1);
+  TestComplexDeconvNxNS12<DeviceType::GPU, float>(5, {17, 113, 5, 7}, 2);
 }

 }  // namespace test