Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Xiaomi
Mace
提交
1fcd812b
Mace
项目概览
Xiaomi
/
Mace
通知
106
Star
40
Fork
27
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
Mace
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
1fcd812b
编写于
5月 16, 2018
作者:
L
liutuo
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
fix ndk-r15c openmp bugs
上级
3b11a1b2
变更
18
隐藏空白更改
内联
并排
Showing
18 changed file
with
371 addition
and
507 deletion
+371
-507
mace/core/runtime/cpu/cpu_runtime.cc
mace/core/runtime/cpu/cpu_runtime.cc
+1
-0
mace/kernels/arm/conv_2d_neon.h
mace/kernels/arm/conv_2d_neon.h
+12
-42
mace/kernels/arm/conv_2d_neon_3x3.cc
mace/kernels/arm/conv_2d_neon_3x3.cc
+26
-27
mace/kernels/arm/conv_2d_neon_5x5.cc
mace/kernels/arm/conv_2d_neon_5x5.cc
+13
-13
mace/kernels/arm/conv_2d_neon_7x7.cc
mace/kernels/arm/conv_2d_neon_7x7.cc
+39
-39
mace/kernels/arm/depthwise_conv2d_neon.h
mace/kernels/arm/depthwise_conv2d_neon.h
+6
-18
mace/kernels/arm/depthwise_conv2d_neon_3x3.cc
mace/kernels/arm/depthwise_conv2d_neon_3x3.cc
+38
-41
mace/kernels/conv_2d.h
mace/kernels/conv_2d.h
+62
-96
mace/kernels/deconv_2d.h
mace/kernels/deconv_2d.h
+49
-60
mace/kernels/depth_to_space.h
mace/kernels/depth_to_space.h
+19
-19
mace/kernels/depthwise_conv2d.h
mace/kernels/depthwise_conv2d.h
+37
-55
mace/kernels/image_to_buffer.h
mace/kernels/image_to_buffer.h
+4
-0
mace/kernels/opencl/bias_add.cc
mace/kernels/opencl/bias_add.cc
+2
-1
mace/kernels/pooling.h
mace/kernels/pooling.h
+58
-95
mace/kernels/winograd_transform.h
mace/kernels/winograd_transform.h
+1
-0
mace/ops/activation.h
mace/ops/activation.h
+1
-1
mace/ops/ops_test_util.h
mace/ops/ops_test_util.h
+2
-0
mace/utils/logging.cc
mace/utils/logging.cc
+1
-0
未找到文件。
mace/core/runtime/cpu/cpu_runtime.cc
浏览文件 @
1fcd812b
...
...
@@ -18,6 +18,7 @@
#include <omp.h>
#endif
#include <errno.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/types.h>
...
...
mace/kernels/arm/conv_2d_neon.h
浏览文件 @
1fcd812b
...
...
@@ -31,68 +31,38 @@ extern void Conv2dNeonK1x1S1(const float *input,
extern
void
Conv2dNeonK3x3S1
(
const
float
*
input
,
const
float
*
filter
,
const
index_t
batch
,
const
index_t
in_height
,
const
index_t
in_width
,
const
index_t
in_channels
,
const
index_t
out_height
,
const
index_t
out_width
,
const
index_t
out_channels
,
const
index_t
*
in_shape
,
const
index_t
*
out_shape
,
float
*
output
);
extern
void
Conv2dNeonK3x3S2
(
const
float
*
input
,
const
float
*
filter
,
const
index_t
batch
,
const
index_t
in_height
,
const
index_t
in_width
,
const
index_t
in_channels
,
const
index_t
out_height
,
const
index_t
out_width
,
const
index_t
out_channels
,
const
index_t
*
in_shape
,
const
index_t
*
out_shape
,
float
*
output
);
extern
void
Conv2dNeonK5x5S1
(
const
float
*
input
,
const
float
*
filter
,
const
index_t
batch
,
const
index_t
in_height
,
const
index_t
in_width
,
const
index_t
in_channels
,
const
index_t
out_height
,
const
index_t
out_width
,
const
index_t
out_channels
,
const
index_t
*
in_shape
,
const
index_t
*
out_shape
,
float
*
output
);
extern
void
Conv2dNeonK7x7S1
(
const
float
*
input
,
const
float
*
filter
,
const
index_t
batch
,
const
index_t
in_height
,
const
index_t
in_width
,
const
index_t
in_channels
,
const
index_t
out_height
,
const
index_t
out_width
,
const
index_t
out_channels
,
const
index_t
*
in_shape
,
const
index_t
*
out_shape
,
float
*
output
);
extern
void
Conv2dNeonK7x7S2
(
const
float
*
input
,
const
float
*
filter
,
const
index_t
batch
,
const
index_t
in_height
,
const
index_t
in_width
,
const
index_t
in_channels
,
const
index_t
out_height
,
const
index_t
out_width
,
const
index_t
out_channels
,
const
index_t
*
in_shape
,
const
index_t
*
out_shape
,
float
*
output
);
extern
void
Conv2dNeonK7x7S3
(
const
float
*
input
,
const
float
*
filter
,
const
index_t
batch
,
const
index_t
in_height
,
const
index_t
in_width
,
const
index_t
in_channels
,
const
index_t
out_height
,
const
index_t
out_width
,
const
index_t
out_channels
,
const
index_t
*
in_shape
,
const
index_t
*
out_shape
,
float
*
output
);
}
// namespace kernels
...
...
mace/kernels/arm/conv_2d_neon_3x3.cc
浏览文件 @
1fcd812b
...
...
@@ -24,22 +24,22 @@ namespace kernels {
// Ho = 2, Wo = 4, Co = 2
void
Conv2dNeonK3x3S1
(
const
float
*
input
,
const
float
*
filter
,
const
index_t
batch
,
const
index_t
in_height
,
const
index_t
in_width
,
const
index_t
in_channels
,
const
index_t
out_height
,
const
index_t
out_width
,
const
index_t
out_channels
,
const
index_t
*
in_shape
,
const
index_t
*
out_shape
,
float
*
output
)
{
const
index_t
in_image_size
=
in_
height
*
in_width
;
const
index_t
out_image_size
=
out_
height
*
out_width
;
const
index_t
in_batch_size
=
in_
channels
*
in_image_size
;
const
index_t
out_batch_size
=
out_
channels
*
out_image_size
;
const
index_t
in_image_size
=
in_
shape
[
2
]
*
in_shape
[
3
]
;
const
index_t
out_image_size
=
out_
shape
[
2
]
*
out_shape
[
3
]
;
const
index_t
in_batch_size
=
in_
shape
[
1
]
*
in_image_size
;
const
index_t
out_batch_size
=
out_
shape
[
1
]
*
out_image_size
;
#pragma omp parallel for collapse(2)
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
index_t
m
=
0
;
m
<
out_channels
;
m
+=
2
)
{
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
m
=
0
;
m
<
out_shape
[
1
];
m
+=
2
)
{
const
index_t
out_channels
=
out_shape
[
1
];
const
index_t
out_height
=
out_shape
[
2
];
const
index_t
out_width
=
out_shape
[
3
];
const
index_t
in_channels
=
in_shape
[
1
];
const
index_t
in_width
=
in_shape
[
3
];
if
(
m
+
1
<
out_channels
)
{
float
*
out_ptr0_base
=
output
+
b
*
out_batch_size
+
m
*
out_image_size
;
#if defined(MACE_ENABLE_NEON)
...
...
@@ -522,23 +522,22 @@ void Conv2dNeonK3x3S1(const float *input,
void
Conv2dNeonK3x3S2
(
const
float
*
input
,
const
float
*
filter
,
const
index_t
batch
,
const
index_t
in_height
,
const
index_t
in_width
,
const
index_t
in_channels
,
const
index_t
out_height
,
const
index_t
out_width
,
const
index_t
out_channels
,
const
index_t
*
in_shape
,
const
index_t
*
out_shape
,
float
*
output
)
{
const
index_t
in_image_size
=
in_
height
*
in_width
;
const
index_t
out_image_size
=
out_
height
*
out_width
;
const
index_t
in_batch_size
=
in_
channels
*
in_image_size
;
const
index_t
out_batch_size
=
out_
channels
*
out_image_size
;
const
index_t
in_image_size
=
in_
shape
[
2
]
*
in_shape
[
3
]
;
const
index_t
out_image_size
=
out_
shape
[
2
]
*
out_shape
[
3
]
;
const
index_t
in_batch_size
=
in_
shape
[
1
]
*
in_image_size
;
const
index_t
out_batch_size
=
out_
shape
[
1
]
*
out_image_size
;
#pragma omp parallel for collapse(2)
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
index_t
m
=
0
;
m
<
out_channels
;
++
m
)
{
for
(
index_t
c
=
0
;
c
<
in_channels
;
++
c
)
{
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
m
=
0
;
m
<
out_shape
[
1
];
++
m
)
{
for
(
index_t
c
=
0
;
c
<
in_shape
[
1
];
++
c
)
{
const
index_t
in_channels
=
in_shape
[
1
];
const
index_t
in_width
=
in_shape
[
3
];
const
index_t
out_height
=
out_shape
[
2
];
const
index_t
out_width
=
out_shape
[
3
];
const
float
*
in_base
=
input
+
b
*
in_batch_size
+
c
*
in_image_size
;
const
float
*
filter_ptr
=
filter
+
m
*
in_channels
*
9
+
c
*
9
;
...
...
mace/kernels/arm/conv_2d_neon_5x5.cc
浏览文件 @
1fcd812b
...
...
@@ -103,22 +103,22 @@ inline void Conv2dCPUK5x5Calc(const float *in_ptr_base,
// Ho = 1, Wo = 4, Co = 4
void
Conv2dNeonK5x5S1
(
const
float
*
input
,
const
float
*
filter
,
const
index_t
batch
,
const
index_t
in_height
,
const
index_t
in_width
,
const
index_t
in_channels
,
const
index_t
out_height
,
const
index_t
out_width
,
const
index_t
out_channels
,
const
index_t
*
in_shape
,
const
index_t
*
out_shape
,
float
*
output
)
{
const
index_t
in_image_size
=
in_
height
*
in_width
;
const
index_t
out_image_size
=
out_
height
*
out_width
;
const
index_t
in_batch_size
=
in_
channels
*
in_image_size
;
const
index_t
out_batch_size
=
out_
channels
*
out_image_size
;
const
index_t
in_image_size
=
in_
shape
[
2
]
*
in_shape
[
3
]
;
const
index_t
out_image_size
=
out_
shape
[
2
]
*
out_shape
[
3
]
;
const
index_t
in_batch_size
=
in_
shape
[
1
]
*
in_image_size
;
const
index_t
out_batch_size
=
out_
shape
[
1
]
*
out_image_size
;
#pragma omp parallel for collapse(2)
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
index_t
m
=
0
;
m
<
out_channels
;
m
+=
4
)
{
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
m
=
0
;
m
<
out_shape
[
1
];
m
+=
4
)
{
const
index_t
out_channels
=
out_shape
[
1
];
const
index_t
out_height
=
out_shape
[
2
];
const
index_t
out_width
=
out_shape
[
3
];
const
index_t
in_channels
=
in_shape
[
1
];
const
index_t
in_width
=
in_shape
[
3
];
if
(
m
+
3
<
out_channels
)
{
float
*
out_ptr0_base
=
output
+
b
*
out_batch_size
+
m
*
out_image_size
;
#if defined(MACE_ENABLE_NEON) && !defined(__aarch64__)
...
...
mace/kernels/arm/conv_2d_neon_7x7.cc
浏览文件 @
1fcd812b
...
...
@@ -180,22 +180,22 @@ inline void Conv2dCPUK7x7Calc(const float *in_ptr_base,
// Ho = 1, Wo = 4, Co = 4
void
Conv2dNeonK7x7S1
(
const
float
*
input
,
const
float
*
filter
,
const
index_t
batch
,
const
index_t
in_height
,
const
index_t
in_width
,
const
index_t
in_channels
,
const
index_t
out_height
,
const
index_t
out_width
,
const
index_t
out_channels
,
const
index_t
*
in_shape
,
const
index_t
*
out_shape
,
float
*
output
)
{
const
index_t
in_image_size
=
in_
height
*
in_width
;
const
index_t
out_image_size
=
out_
height
*
out_width
;
const
index_t
in_batch_size
=
in_
channels
*
in_image_size
;
const
index_t
out_batch_size
=
out_
channels
*
out_image_size
;
const
index_t
in_image_size
=
in_
shape
[
2
]
*
in_shape
[
3
]
;
const
index_t
out_image_size
=
out_
shape
[
2
]
*
out_shape
[
3
]
;
const
index_t
in_batch_size
=
in_
shape
[
1
]
*
in_image_size
;
const
index_t
out_batch_size
=
out_
shape
[
1
]
*
out_image_size
;
#pragma omp parallel for collapse(2)
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
index_t
m
=
0
;
m
<
out_channels
;
m
+=
4
)
{
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
m
=
0
;
m
<
out_shape
[
1
];
m
+=
4
)
{
const
index_t
out_channels
=
out_shape
[
1
];
const
index_t
out_height
=
out_shape
[
2
];
const
index_t
out_width
=
out_shape
[
3
];
const
index_t
in_channels
=
in_shape
[
1
];
const
index_t
in_width
=
in_shape
[
3
];
if
(
m
+
3
<
out_channels
)
{
float
*
out_ptr0_base
=
output
+
b
*
out_batch_size
+
m
*
out_image_size
;
#if defined(MACE_ENABLE_NEON)
...
...
@@ -336,22 +336,22 @@ void Conv2dNeonK7x7S1(const float *input,
// Ho = 1, Wo = 4, Co = 4
void
Conv2dNeonK7x7S2
(
const
float
*
input
,
const
float
*
filter
,
const
index_t
batch
,
const
index_t
in_height
,
const
index_t
in_width
,
const
index_t
in_channels
,
const
index_t
out_height
,
const
index_t
out_width
,
const
index_t
out_channels
,
const
index_t
*
in_shape
,
const
index_t
*
out_shape
,
float
*
output
)
{
const
index_t
in_image_size
=
in_
height
*
in_width
;
const
index_t
out_image_size
=
out_
height
*
out_width
;
const
index_t
in_batch_size
=
in_
channels
*
in_image_size
;
const
index_t
out_batch_size
=
out_
channels
*
out_image_size
;
const
index_t
in_image_size
=
in_
shape
[
2
]
*
in_shape
[
3
]
;
const
index_t
out_image_size
=
out_
shape
[
2
]
*
out_shape
[
3
]
;
const
index_t
in_batch_size
=
in_
shape
[
1
]
*
in_image_size
;
const
index_t
out_batch_size
=
out_
shape
[
1
]
*
out_image_size
;
#pragma omp parallel for collapse(2)
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
index_t
m
=
0
;
m
<
out_channels
;
m
+=
4
)
{
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
m
=
0
;
m
<
out_shape
[
1
];
m
+=
4
)
{
const
index_t
out_channels
=
out_shape
[
1
];
const
index_t
out_height
=
out_shape
[
2
];
const
index_t
out_width
=
out_shape
[
3
];
const
index_t
in_channels
=
in_shape
[
1
];
const
index_t
in_width
=
in_shape
[
3
];
if
(
m
+
3
<
out_channels
)
{
float
*
out_ptr0_base
=
output
+
b
*
out_batch_size
+
m
*
out_image_size
;
#if defined(MACE_ENABLE_NEON)
...
...
@@ -502,22 +502,22 @@ void Conv2dNeonK7x7S2(const float *input,
// Ho = 1, Wo = 4, Co = 4
void
Conv2dNeonK7x7S3
(
const
float
*
input
,
const
float
*
filter
,
const
index_t
batch
,
const
index_t
in_height
,
const
index_t
in_width
,
const
index_t
in_channels
,
const
index_t
out_height
,
const
index_t
out_width
,
const
index_t
out_channels
,
const
index_t
*
in_shape
,
const
index_t
*
out_shape
,
float
*
output
)
{
const
index_t
in_image_size
=
in_
height
*
in_width
;
const
index_t
out_image_size
=
out_
height
*
out_width
;
const
index_t
in_batch_size
=
in_
channels
*
in_image_size
;
const
index_t
out_batch_size
=
out_
channels
*
out_image_size
;
const
index_t
in_image_size
=
in_
shape
[
2
]
*
in_shape
[
3
]
;
const
index_t
out_image_size
=
out_
shape
[
2
]
*
out_shape
[
3
]
;
const
index_t
in_batch_size
=
in_
shape
[
1
]
*
in_image_size
;
const
index_t
out_batch_size
=
out_
shape
[
1
]
*
out_image_size
;
#pragma omp parallel for collapse(2)
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
index_t
m
=
0
;
m
<
out_channels
;
m
+=
4
)
{
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
m
=
0
;
m
<
out_shape
[
1
];
m
+=
4
)
{
const
index_t
out_channels
=
out_shape
[
1
];
const
index_t
out_height
=
out_shape
[
2
];
const
index_t
out_width
=
out_shape
[
3
];
const
index_t
in_channels
=
in_shape
[
1
];
const
index_t
in_width
=
in_shape
[
3
];
if
(
m
+
3
<
out_channels
)
{
float
*
out_ptr0_base
=
output
+
b
*
out_batch_size
+
m
*
out_image_size
;
#if defined(MACE_ENABLE_NEON)
...
...
mace/kernels/arm/depthwise_conv2d_neon.h
浏览文件 @
1fcd812b
...
...
@@ -22,15 +22,9 @@ namespace kernels {
void
DepthwiseConv2dNeonK3x3S1
(
const
float
*
input
,
const
float
*
filter
,
const
index_t
batch
,
const
index_t
in_height
,
const
index_t
in_width
,
const
index_t
in_channels
,
const
index_t
out_height
,
const
index_t
out_width
,
const
index_t
out_channels
,
const
int
pad_top
,
const
int
pad_left
,
const
index_t
*
in_shape
,
const
index_t
*
out_shape
,
const
int
*
pad_hw
,
const
index_t
valid_h_start
,
const
index_t
valid_h_stop
,
const
index_t
valid_w_start
,
...
...
@@ -39,15 +33,9 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
void
DepthwiseConv2dNeonK3x3S2
(
const
float
*
input
,
const
float
*
filter
,
const
index_t
batch
,
const
index_t
in_height
,
const
index_t
in_width
,
const
index_t
in_channels
,
const
index_t
out_height
,
const
index_t
out_width
,
const
index_t
out_channels
,
const
int
pad_top
,
const
int
pad_left
,
const
index_t
*
in_shape
,
const
index_t
*
out_shape
,
const
int
*
pad_hw
,
const
index_t
valid_h_start
,
const
index_t
valid_h_stop
,
const
index_t
valid_w_start
,
...
...
mace/kernels/arm/depthwise_conv2d_neon_3x3.cc
浏览文件 @
1fcd812b
...
...
@@ -52,15 +52,9 @@ void DepthwiseConv2dPixel(const float *in_base,
// Ho = 2, Wo = 4, Co = 1
void
DepthwiseConv2dNeonK3x3S1
(
const
float
*
input
,
const
float
*
filter
,
const
index_t
batch
,
const
index_t
in_height
,
const
index_t
in_width
,
const
index_t
in_channels
,
const
index_t
out_height
,
const
index_t
out_width
,
const
index_t
out_channels
,
const
int
pad_top
,
const
int
pad_left
,
const
index_t
*
in_shape
,
const
index_t
*
out_shape
,
const
int
*
pad_hw
,
const
index_t
valid_h_start
,
const
index_t
valid_h_stop
,
const
index_t
valid_w_start
,
...
...
@@ -70,25 +64,30 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
MACE_UNUSED
(
valid_w_start
);
MACE_UNUSED
(
valid_w_stop
);
#endif
const
index_t
multiplier
=
out_
channels
/
in_channels
;
const
index_t
in_image_size
=
in_
height
*
in_width
;
const
index_t
out_image_size
=
out_
height
*
out_width
;
const
index_t
in_batch_size
=
in_
channels
*
in_image_size
;
const
index_t
out_batch_size
=
out_
channels
*
out_image_size
;
const
index_t
multiplier
=
out_
shape
[
1
]
/
in_shape
[
1
]
;
const
index_t
in_image_size
=
in_
shape
[
2
]
*
in_shape
[
3
]
;
const
index_t
out_image_size
=
out_
shape
[
2
]
*
out_shape
[
3
]
;
const
index_t
in_batch_size
=
in_
shape
[
1
]
*
in_image_size
;
const
index_t
out_batch_size
=
out_
shape
[
1
]
*
out_image_size
;
#pragma omp parallel for collapse(2)
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
index_t
m
=
0
;
m
<
out_
channels
;
++
m
)
{
for
(
index_t
b
=
0
;
b
<
in_shape
[
0
]
;
++
b
)
{
for
(
index_t
m
=
0
;
m
<
out_
shape
[
1
]
;
++
m
)
{
index_t
c
=
m
/
multiplier
;
index_t
multi_index
=
m
%
multiplier
;
const
float
*
in_base
=
input
+
b
*
in_batch_size
+
c
*
in_image_size
;
const
float
*
filter_ptr
=
filter
+
multi_index
*
in_
channels
*
9
+
c
*
9
;
const
float
*
filter_ptr
=
filter
+
multi_index
*
in_
shape
[
1
]
*
9
+
c
*
9
;
float
*
out_base
=
output
+
b
*
out_batch_size
+
m
*
out_image_size
;
index_t
h
,
w
;
const
index_t
pad_top
=
pad_hw
[
0
];
const
index_t
pad_left
=
pad_hw
[
1
];
const
index_t
out_width
=
out_shape
[
3
];
const
index_t
in_height
=
in_shape
[
2
];
const
index_t
in_width
=
in_shape
[
3
];
// top
for
(
h
=
0
;
h
<
valid_h_start
;
++
h
)
{
for
(
w
=
0
;
w
<
out_
width
;
++
w
)
{
for
(
w
=
0
;
w
<
out_
shape
[
3
]
;
++
w
)
{
DepthwiseConv2dPixel
(
in_base
,
filter_ptr
,
h
,
...
...
@@ -256,7 +255,7 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
}
// h
#else
for
(
index_t
ih
=
valid_h_start
;
ih
<
valid_h_stop
;
++
ih
)
{
for
(
index_t
iw
=
0
;
iw
<
out_
width
;
++
iw
)
{
for
(
index_t
iw
=
0
;
iw
<
out_
shape
[
3
]
;
++
iw
)
{
DepthwiseConv2dPixel
(
in_base
,
filter_ptr
,
ih
,
...
...
@@ -274,8 +273,8 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
#endif
// bottom
for
(;
h
<
out_
height
;
++
h
)
{
for
(
w
=
0
;
w
<
out_
width
;
++
w
)
{
for
(;
h
<
out_
shape
[
2
]
;
++
h
)
{
for
(
w
=
0
;
w
<
out_
shape
[
3
]
;
++
w
)
{
DepthwiseConv2dPixel
(
in_base
,
filter_ptr
,
h
,
...
...
@@ -296,15 +295,9 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
void
DepthwiseConv2dNeonK3x3S2
(
const
float
*
input
,
const
float
*
filter
,
const
index_t
batch
,
const
index_t
in_height
,
const
index_t
in_width
,
const
index_t
in_channels
,
const
index_t
out_height
,
const
index_t
out_width
,
const
index_t
out_channels
,
const
int
pad_top
,
const
int
pad_left
,
const
index_t
*
in_shape
,
const
index_t
*
out_shape
,
const
int
*
pad_hw
,
const
index_t
valid_h_start
,
const
index_t
valid_h_stop
,
const
index_t
valid_w_start
,
...
...
@@ -314,22 +307,26 @@ void DepthwiseConv2dNeonK3x3S2(const float *input,
MACE_UNUSED
(
valid_w_start
);
MACE_UNUSED
(
valid_w_stop
);
#endif
const
index_t
multiplier
=
out_
channels
/
in_channels
;
const
index_t
in_image_size
=
in_
height
*
in_width
;
const
index_t
out_image_size
=
out_
height
*
out_width
;
const
index_t
in_batch_size
=
in_
channels
*
in_image_size
;
const
index_t
out_batch_size
=
out_
channels
*
out_image_size
;
const
index_t
multiplier
=
out_
shape
[
1
]
/
in_shape
[
1
]
;
const
index_t
in_image_size
=
in_
shape
[
2
]
*
in_shape
[
3
]
;
const
index_t
out_image_size
=
out_
shape
[
2
]
*
out_shape
[
3
]
;
const
index_t
in_batch_size
=
in_
shape
[
1
]
*
in_image_size
;
const
index_t
out_batch_size
=
out_
shape
[
1
]
*
out_image_size
;
#pragma omp parallel for collapse(2)
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
index_t
m
=
0
;
m
<
out_
channels
;
++
m
)
{
for
(
index_t
b
=
0
;
b
<
in_shape
[
0
]
;
++
b
)
{
for
(
index_t
m
=
0
;
m
<
out_
shape
[
1
]
;
++
m
)
{
index_t
c
=
m
/
multiplier
;
index_t
multi_index
=
m
%
multiplier
;
const
float
*
in_base
=
input
+
b
*
in_batch_size
+
c
*
in_image_size
;
const
float
*
filter_ptr
=
filter
+
multi_index
*
in_
channels
*
9
+
c
*
9
;
const
float
*
filter_ptr
=
filter
+
multi_index
*
in_
shape
[
1
]
*
9
+
c
*
9
;
float
*
out_base
=
output
+
b
*
out_batch_size
+
m
*
out_image_size
;
index_t
h
,
w
;
const
index_t
pad_top
=
pad_hw
[
0
];
const
index_t
pad_left
=
pad_hw
[
1
];
const
index_t
out_width
=
out_shape
[
3
];
const
index_t
in_height
=
in_shape
[
2
];
const
index_t
in_width
=
in_shape
[
3
];
// top
for
(
h
=
0
;
h
<
valid_h_start
;
++
h
)
{
for
(
w
=
0
;
w
<
out_width
;
++
w
)
{
...
...
@@ -472,8 +469,8 @@ void DepthwiseConv2dNeonK3x3S2(const float *input,
#endif
// bottom
for
(;
h
<
out_
height
;
++
h
)
{
for
(
w
=
0
;
w
<
out_
width
;
++
w
)
{
for
(;
h
<
out_
shape
[
2
]
;
++
h
)
{
for
(
w
=
0
;
w
<
out_
shape
[
3
]
;
++
w
)
{
DepthwiseConv2dPixel
(
in_base
,
filter_ptr
,
h
,
...
...
mace/kernels/conv_2d.h
浏览文件 @
1fcd812b
...
...
@@ -84,49 +84,46 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
void
Conv2dGeneral
(
const
float
*
input
,
const
float
*
filter
,
const
index_t
batch
,
const
index_t
in_height
,
const
index_t
in_width
,
const
index_t
in_channels
,
const
index_t
out_height
,
const
index_t
out_width
,
const
index_t
out_channels
,
const
int
filter_height
,
const
int
filter_width
,
const
int
stride_h
,
const
int
stride_w
,
const
int
dilation_h
,
const
int
dilation_w
,
const
index_t
*
in_shape
,
const
index_t
*
out_shape
,
const
index_t
*
filter_shape
,
const
int
*
stride_hw
,
const
int
*
dilation_hw
,
float
*
output
)
{
const
index_t
in_image_size
=
in_
height
*
in_width
;
const
index_t
out_image_size
=
out_
height
*
out_width
;
const
index_t
in_batch_size
=
in_channels
*
in_image_size
;
const
index_t
out_batch_size
=
out_channels
*
out_image_size
;
const
index_t
filter_size
=
filter_
height
*
filter_width
;
const
index_t
in_image_size
=
in_
shape
[
2
]
*
in_shape
[
3
]
;
const
index_t
out_image_size
=
out_
shape
[
2
]
*
out_shape
[
3
]
;
const
index_t
in_batch_size
=
filter_shape
[
1
]
*
in_image_size
;
const
index_t
out_batch_size
=
filter_shape
[
0
]
*
out_image_size
;
const
index_t
filter_size
=
filter_
shape
[
2
]
*
filter_shape
[
3
]
;
#pragma omp parallel for collapse(2)
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
index_t
m
=
0
;
m
<
out_channels
;
m
+=
4
)
{
for
(
index_t
b
=
0
;
b
<
in_shape
[
0
];
b
++
)
{
for
(
index_t
m
=
0
;
m
<
filter_shape
[
0
];
m
+=
4
)
{
const
index_t
in_height
=
in_shape
[
2
];
const
index_t
in_width
=
in_shape
[
3
];
const
index_t
out_height
=
out_shape
[
2
];
const
index_t
out_width
=
out_shape
[
3
];
const
index_t
out_channels
=
filter_shape
[
0
];
const
index_t
in_channels
=
filter_shape
[
1
];
const
int
stride_h
=
stride_hw
[
0
];
const
int
stride_w
=
stride_hw
[
1
];
const
int
dilation_h
=
dilation_hw
[
0
];
const
int
dilation_w
=
dilation_hw
[
1
];
if
(
m
+
3
<
out_channels
)
{
float
*
out_ptr0_base
=
output
+
b
*
out_batch_size
+
m
*
out_image_size
;
float
*
out_ptr1_base
=
output
+
b
*
out_batch_size
+
(
m
+
1
)
*
out_image_size
;
float
*
out_ptr2_base
=
output
+
b
*
out_batch_size
+
(
m
+
2
)
*
out_image_size
;
float
*
out_ptr3_base
=
output
+
b
*
out_batch_size
+
(
m
+
3
)
*
out_image_size
;
float
*
out_ptr1_base
=
out_ptr0_base
+
out_image_size
;
float
*
out_ptr2_base
=
out_ptr1_base
+
out_image_size
;
float
*
out_ptr3_base
=
out_ptr2_base
+
out_image_size
;
for
(
index_t
c
=
0
;
c
<
in_channels
;
++
c
)
{
const
float
*
in_ptr_base
=
input
+
b
*
in_batch_size
+
c
*
in_image_size
;
const
float
*
filter_ptr0
=
filter
+
m
*
in_channels
*
filter_size
+
c
*
filter_size
;
const
float
*
filter_ptr1
=
filter
+
(
m
+
1
)
*
in_channels
*
filter_size
+
c
*
filter_size
;
const
float
*
filter_ptr2
=
filter
+
(
m
+
2
)
*
in_channels
*
filter_size
+
c
*
filter_size
;
const
float
*
filter_ptr3
=
filter
+
(
m
+
3
)
*
in_channels
*
filter_size
+
c
*
filter_size
;
const
float
*
filter_ptr1
=
filter_ptr0
+
in_channels
*
filter_size
;
const
float
*
filter_ptr2
=
filter_ptr1
+
in_channels
*
filter_size
;
const
float
*
filter_ptr3
=
filter_ptr2
+
in_channels
*
filter_size
;
for
(
index_t
h
=
0
;
h
<
out_height
;
++
h
)
{
for
(
index_t
w
=
0
;
w
+
3
<
out_width
;
w
+=
4
)
{
// input offset
...
...
@@ -144,8 +141,8 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
vo3
[
ow
]
=
out_ptr3_base
[
out_offset
+
ow
];
}
// calc by row
for
(
index_t
kh
=
0
;
kh
<
filter_
height
;
++
kh
)
{
for
(
index_t
kw
=
0
;
kw
<
filter_
width
;
++
kw
)
{
for
(
index_t
kh
=
0
;
kh
<
filter_
shape
[
2
]
;
++
kh
)
{
for
(
index_t
kw
=
0
;
kw
<
filter_
shape
[
3
]
;
++
kw
)
{
// outch 0
vo0
[
0
]
+=
in_ptr_base
[
in_offset
+
kw
*
dilation_w
]
*
filter_ptr0
[
kw
];
...
...
@@ -185,10 +182,10 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
}
// kw
in_offset
+=
dilation_h
*
in_width
;
filter_ptr0
+=
filter_
width
;
filter_ptr1
+=
filter_
width
;
filter_ptr2
+=
filter_
width
;
filter_ptr3
+=
filter_
width
;
filter_ptr0
+=
filter_
shape
[
3
]
;
filter_ptr1
+=
filter_
shape
[
3
]
;
filter_ptr2
+=
filter_
shape
[
3
]
;
filter_ptr3
+=
filter_
shape
[
3
]
;
}
// kh
for
(
index_t
ow
=
0
;
ow
<
4
;
++
ow
)
{
...
...
@@ -230,8 +227,8 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
}
// calc by row
for
(
index_t
kh
=
0
;
kh
<
filter_
height
;
++
kh
)
{
for
(
index_t
kw
=
0
;
kw
<
filter_
width
;
++
kw
)
{
for
(
index_t
kh
=
0
;
kh
<
filter_
shape
[
2
]
;
++
kh
)
{
for
(
index_t
kw
=
0
;
kw
<
filter_
shape
[
3
]
;
++
kw
)
{
// outch 0
vo0
[
0
]
+=
in_ptr_base
[
in_offset
+
kw
*
dilation_w
]
*
filter_ptr0
[
kw
];
...
...
@@ -244,7 +241,7 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
}
// kw
in_offset
+=
dilation_h
*
in_width
;
filter_ptr0
+=
filter_
width
;
filter_ptr0
+=
filter_
shape
[
3
]
;
}
// kh
for
(
index_t
ow
=
0
;
ow
<
4
;
++
ow
)
{
...
...
@@ -325,6 +322,8 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
index_t
dilation_h
=
dilations_
[
0
];
index_t
dilation_w
=
dilations_
[
1
];
const
index_t
filter_hw
[
2
]
=
{
filter_h
,
filter_w
};
MACE_CHECK
(
batch
==
input_batch
,
"Input/Output batch size mismatch"
);
index_t
padded_input_height
=
input_height
+
paddings
[
0
];
...
...
@@ -478,6 +477,10 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
transformed_output
(
scratch_
->
Scratch
(
transformed_output_size
),
DT_FLOAT
);
Tensor
padded_input
(
scratch_
->
Scratch
(
padded_input_size
),
DT_FLOAT
);
Tensor
padded_output
(
scratch_
->
Scratch
(
padded_output_size
),
DT_FLOAT
);
const
index_t
extra_input_shape
[
4
]
=
{
batch
,
input_channels
,
extra_input_height
,
extra_input_width
};
const
index_t
extra_output_shape
[
4
]
=
{
batch
,
channels
,
extra_output_height
,
extra_output_width
};
// decide which convolution function to call
if
(
use_winograd
)
{
...
...
@@ -512,6 +515,7 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
float
*
transformed_input_data
=
transformed_input
.
mutable_data
<
float
>
();
float
*
transformed_output_data
=
transformed_output
.
mutable_data
<
float
>
();
conv_func
=
[
=
](
const
float
*
pad_input
,
float
*
pad_output
)
{
WinoGradConv3x3s1
(
pad_input
,
transformed_filter_ptr
,
...
...
@@ -529,26 +533,16 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
conv_func
=
[
=
](
const
float
*
pad_input
,
float
*
pad_output
)
{
Conv2dNeonK3x3S1
(
pad_input
,
filter_data
,
batch
,
extra_input_height
,
extra_input_width
,
input_channels
,
extra_output_height
,
extra_output_width
,
channels
,
extra_input_shape
,
extra_output_shape
,
pad_output
);
};
}
else
if
(
use_neon_3x3_s2
)
{
conv_func
=
[
=
](
const
float
*
pad_input
,
float
*
pad_output
)
{
Conv2dNeonK3x3S2
(
pad_input
,
filter_data
,
batch
,
extra_input_height
,
extra_input_width
,
input_channels
,
extra_output_height
,
extra_output_width
,
channels
,
extra_input_shape
,
extra_output_shape
,
pad_output
);
};
}
else
if
(
use_neon_1x1_s1
)
{
...
...
@@ -566,71 +560,43 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
conv_func
=
[
=
](
const
float
*
pad_input
,
float
*
pad_output
)
{
Conv2dNeonK5x5S1
(
pad_input
,
filter_data
,
batch
,
extra_input_height
,
extra_input_width
,
input_channels
,
extra_output_height
,
extra_output_width
,
channels
,
extra_input_shape
,
extra_output_shape
,
pad_output
);
};
}
else
if
(
use_neon_7x7_s1
)
{
conv_func
=
[
=
](
const
float
*
pad_input
,
float
*
pad_output
)
{
Conv2dNeonK7x7S1
(
pad_input
,
filter_data
,
batch
,
extra_input_height
,
extra_input_width
,
input_channels
,
extra_output_height
,
extra_output_width
,
channels
,
extra_input_shape
,
extra_output_shape
,
pad_output
);
};
}
else
if
(
use_neon_7x7_s2
)
{
conv_func
=
[
=
](
const
float
*
pad_input
,
float
*
pad_output
)
{
Conv2dNeonK7x7S2
(
pad_input
,
filter_data
,
batch
,
extra_input_height
,
extra_input_width
,
input_channels
,
extra_output_height
,
extra_output_width
,
channels
,
extra_input_shape
,
extra_output_shape
,
pad_output
);
};
}
else
if
(
use_neon_7x7_s3
)
{
conv_func
=
[
=
](
const
float
*
pad_input
,
float
*
pad_output
)
{
Conv2dNeonK7x7S3
(
pad_input
,
filter_data
,
batch
,
extra_input_height
,
extra_input_width
,
input_channels
,
extra_output_height
,
extra_output_width
,
channels
,
extra_input_shape
,
extra_output_shape
,
pad_output
);
};
}
else
{
conv_func
=
[
=
](
const
float
*
pad_input
,
float
*
pad_output
)
{
Conv2dGeneral
(
pad_input
,
filter_data
,
batch
,
extra_input_height
,
extra_input_width
,
input_channels
,
extra_output_height
,
extra_output_width
,
channels
,
filter_h
,
filter_w
,
stride_h
,
stride_w
,
dilation_h
,
dilation_w
,
extra_input_shape
,
extra_output_shape
,
filter_shape
.
data
(),
strides_
,
dilations_
,
pad_output
);
};
}
...
...
mace/kernels/deconv_2d.h
浏览文件 @
1fcd812b
...
...
@@ -41,48 +41,40 @@ template<typename T>
void
Deconv2dNCHW
(
const
T
*
input
,
const
T
*
filter
,
const
T
*
bias
,
const
index_t
batch
,
const
index_t
in_height
,
const
index_t
in_width
,
const
index_t
in_channels
,
const
index_t
out_height
,
const
index_t
out_width
,
const
index_t
out_channels
,
const
index_t
filter_height
,
const
index_t
filter_width
,
const
index_t
stride_h
,
const
index_t
stride_w
,
const
int
padding_top
,
const
int
padding_left
,
const
index_t
*
in_shape
,
const
index_t
*
out_shape
,
const
index_t
*
kernel_hw
,
const
int
*
strides
,
const
int
*
padding
,
float
*
output
)
{
#pragma omp parallel for collapse(4)
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
index_t
oc
=
0
;
oc
<
out_
channels
;
++
oc
)
{
for
(
index_t
oh
=
0
;
oh
<
out_
height
;
++
oh
)
{
for
(
index_t
ow
=
0
;
ow
<
out_
width
;
++
ow
)
{
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
]
;
++
b
)
{
for
(
index_t
oc
=
0
;
oc
<
out_
shape
[
1
]
;
++
oc
)
{
for
(
index_t
oh
=
0
;
oh
<
out_
shape
[
2
]
;
++
oh
)
{
for
(
index_t
ow
=
0
;
ow
<
out_
shape
[
3
]
;
++
ow
)
{
index_t
filter_start_y
,
filter_start_x
;
index_t
start_x
=
std
::
max
<
int
>
(
0
,
ow
+
stride
_w
-
1
-
padding_left
);
index_t
start_y
=
std
::
max
<
int
>
(
0
,
oh
+
stride
_h
-
1
-
padding_top
);
start_x
/=
stride
_w
;
start_y
/=
stride
_h
;
filter_start_x
=
padding
_left
+
stride_w
*
start_x
-
ow
;
filter_start_y
=
padding
_top
+
stride_h
*
start_y
-
oh
;
filter_start_x
=
filter_width
-
1
-
filter_start_x
;
filter_start_y
=
filter_height
-
1
-
filter_start_y
;
index_t
start_x
=
std
::
max
<
int
>
(
0
,
ow
+
stride
s
[
1
]
-
1
-
padding
[
1
]
);
index_t
start_y
=
std
::
max
<
int
>
(
0
,
oh
+
stride
s
[
0
]
-
1
-
padding
[
0
]
);
start_x
/=
stride
s
[
1
]
;
start_y
/=
stride
s
[
0
]
;
filter_start_x
=
padding
[
1
]
+
strides
[
1
]
*
start_x
-
ow
;
filter_start_y
=
padding
[
0
]
+
strides
[
0
]
*
start_y
-
oh
;
filter_start_x
=
kernel_hw
[
1
]
-
1
-
filter_start_x
;
filter_start_y
=
kernel_hw
[
0
]
-
1
-
filter_start_y
;
T
out_value
=
0
;
index_t
out_pos
=
((
b
*
out_
channels
+
oc
)
*
out_height
+
oh
)
*
out_width
+
ow
;
for
(
index_t
ic
=
0
;
ic
<
in_
channels
;
++
ic
)
{
((
b
*
out_
shape
[
1
]
+
oc
)
*
out_shape
[
2
]
+
oh
)
*
out_shape
[
3
]
+
ow
;
for
(
index_t
ic
=
0
;
ic
<
in_
shape
[
1
]
;
++
ic
)
{
for
(
index_t
f_y
=
filter_start_y
,
ih
=
start_y
;
f_y
>=
0
&&
ih
<
in_
height
;
f_y
-=
stride_h
,
++
ih
)
{
f_y
>=
0
&&
ih
<
in_
shape
[
2
];
f_y
-=
strides
[
0
]
,
++
ih
)
{
for
(
index_t
f_x
=
filter_start_x
,
iw
=
start_x
;
f_x
>=
0
&&
iw
<
in_
width
;
f_x
-=
stride_w
,
++
iw
)
{
f_x
>=
0
&&
iw
<
in_
shape
[
3
];
f_x
-=
strides
[
1
]
,
++
iw
)
{
index_t
weight_pos
=
((
oc
*
in_
channels
+
ic
)
*
filter_height
+
f_y
)
*
filter_width
+
f_x
;
((
oc
*
in_
shape
[
1
]
+
ic
)
*
kernel_hw
[
0
]
+
f_y
)
*
kernel_hw
[
1
]
+
f_x
;
index_t
in_pos
=
((
b
*
in_
channels
+
ic
)
*
in_height
+
ih
)
*
in_
width
+
iw
;
((
b
*
in_
shape
[
1
]
+
ic
)
*
in_shape
[
2
]
+
ih
)
*
in_
shape
[
3
]
+
iw
;
out_value
+=
input
[
in_pos
]
*
filter
[
weight_pos
];
}
}
...
...
@@ -269,26 +261,17 @@ struct Deconv2dFunctor : Deconv2dFunctorBase {
paddings_
.
data
(),
true
);
output
->
Resize
(
output_shape_
);
}
index_t
batch
=
output
->
dim
(
0
);
index_t
channels
=
output
->
dim
(
1
);
index_t
height
=
output
->
dim
(
2
);
index_t
width
=
output
->
dim
(
3
);
index_t
input_batch
=
input
->
dim
(
0
);
index_t
input_channels
=
input
->
dim
(
1
);
index_t
input_height
=
input
->
dim
(
2
);
index_t
input_width
=
input
->
dim
(
3
);
index_t
kernel_h
=
filter
->
dim
(
2
);
index_t
kernel_w
=
filter
->
dim
(
3
);
MACE_CHECK
(
filter
->
dim
(
0
)
==
channels
,
filter
->
dim
(
0
),
" != "
,
channels
);
MACE_CHECK
(
filter
->
dim
(
1
)
==
input_channels
,
filter
->
dim
(
1
),
" != "
,
input_channels
);
index_t
stride_h
=
strides_
[
0
];
index_t
stride_w
=
strides_
[
1
];
MACE_CHECK
(
batch
==
input_batch
,
"Input/Output batch size mismatch"
);
const
index_t
*
in_shape
=
input
->
shape
().
data
();
const
index_t
*
out_shape
=
output
->
shape
().
data
();
const
index_t
kernel_hw
[
2
]
=
{
kernel_h
,
kernel_w
};
MACE_CHECK
(
filter
->
dim
(
0
)
==
out_shape
[
1
],
filter
->
dim
(
0
),
" != "
,
output_shape
[
1
]);
MACE_CHECK
(
filter
->
dim
(
1
)
==
in_shape
[
1
],
filter
->
dim
(
1
),
" != "
,
in_shape
[
1
]);
MACE_CHECK
(
in_shape
[
0
]
==
out_shape
[
0
],
"Input/Output batch size mismatch"
);
Tensor
::
MappingGuard
input_mapper
(
input
);
Tensor
::
MappingGuard
filter_mapper
(
filter
);
Tensor
::
MappingGuard
bias_mapper
(
bias
);
...
...
@@ -297,17 +280,23 @@ struct Deconv2dFunctor : Deconv2dFunctorBase {
auto
filter_data
=
filter
->
data
<
T
>
();
auto
bias_data
=
bias
==
nullptr
?
nullptr
:
bias
->
data
<
T
>
();
auto
output_data
=
output
->
mutable_data
<
T
>
();
int
padding_top
=
(
paddings_
[
0
]
+
1
)
>>
1
;
int
padding_left
=
(
paddings_
[
1
]
+
1
)
>>
1
;
deconv
::
Deconv2dNCHW
(
input_data
,
filter_data
,
bias_data
,
batch
,
input_height
,
input_width
,
input_channels
,
height
,
width
,
channels
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
padding_top
,
padding_left
,
int
padding
[
2
];
padding
[
0
]
=
(
paddings_
[
0
]
+
1
)
>>
1
;
padding
[
1
]
=
(
paddings_
[
1
]
+
1
)
>>
1
;
deconv
::
Deconv2dNCHW
(
input_data
,
filter_data
,
bias_data
,
in_shape
,
out_shape
,
kernel_hw
,
strides_
,
padding
,
output_data
);
DoActivation
(
output_data
,
output_data
,
output
->
size
(),
activation_
,
DoActivation
(
output_data
,
output_data
,
output
->
size
(),
activation_
,
relux_max_limit_
);
}
};
...
...
mace/kernels/depth_to_space.h
浏览文件 @
1fcd812b
...
...
@@ -34,10 +34,10 @@ struct DepthToSpaceOpFunctor {
:
block_size_
(
block_size
),
d2s_
(
d2s
)
{}
void
operator
()(
const
Tensor
*
input
,
Tensor
*
output
,
StatsFuture
*
future
)
{
MACE_UNUSED
(
future
);
const
int
batch_size
=
input
->
dim
(
0
);
const
int
input_depth
=
input
->
dim
(
1
);
const
int
input_height
=
input
->
dim
(
2
);
const
int
input_width
=
input
->
dim
(
3
);
const
in
dex_
t
batch_size
=
input
->
dim
(
0
);
const
in
dex_
t
input_depth
=
input
->
dim
(
1
);
const
in
dex_
t
input_height
=
input
->
dim
(
2
);
const
in
dex_
t
input_width
=
input
->
dim
(
3
);
index_t
output_depth
,
output_width
,
output_height
;
...
...
@@ -62,11 +62,11 @@ struct DepthToSpaceOpFunctor {
if
(
d2s_
)
{
#pragma omp parallel for
for
(
int
b
=
0
;
b
<
batch_size
;
++
b
)
{
for
(
int
d
=
0
;
d
<
output_depth
;
++
d
)
{
for
(
int
h
=
0
;
h
<
output_height
;
++
h
)
{
const
int
in_h
=
h
/
block_size_
;
const
int
offset_h
=
(
h
%
block_size_
);
for
(
in
dex_
t
b
=
0
;
b
<
batch_size
;
++
b
)
{
for
(
in
dex_
t
d
=
0
;
d
<
output_depth
;
++
d
)
{
for
(
in
dex_
t
h
=
0
;
h
<
output_height
;
++
h
)
{
const
in
dex_
t
in_h
=
h
/
block_size_
;
const
in
dex_
t
offset_h
=
(
h
%
block_size_
);
for
(
int
w
=
0
;
w
<
output_width
;
++
w
)
{
const
index_t
in_w
=
w
/
block_size_
;
const
index_t
offset_w
=
w
%
block_size_
;
...
...
@@ -86,18 +86,18 @@ struct DepthToSpaceOpFunctor {
}
}
else
{
#pragma omp parallel for
for
(
int
b
=
0
;
b
<
batch_size
;
++
b
)
{
for
(
int
d
=
0
;
d
<
input_depth
;
++
d
)
{
for
(
int
h
=
0
;
h
<
input_height
;
++
h
)
{
const
int
out_h
=
h
/
block_size_
;
const
int
offset_h
=
(
h
%
block_size_
);
for
(
int
w
=
0
;
w
<
input_width
;
++
w
)
{
const
int
out_w
=
w
/
block_size_
;
const
int
offset_w
=
(
w
%
block_size_
);
const
int
offset_d
=
for
(
in
dex_
t
b
=
0
;
b
<
batch_size
;
++
b
)
{
for
(
in
dex_
t
d
=
0
;
d
<
input_depth
;
++
d
)
{
for
(
in
dex_
t
h
=
0
;
h
<
input_height
;
++
h
)
{
const
in
dex_
t
out_h
=
h
/
block_size_
;
const
in
dex_
t
offset_h
=
(
h
%
block_size_
);
for
(
in
dex_
t
w
=
0
;
w
<
input_width
;
++
w
)
{
const
in
dex_
t
out_w
=
w
/
block_size_
;
const
in
dex_
t
offset_w
=
(
w
%
block_size_
);
const
in
dex_
t
offset_d
=
(
offset_h
*
block_size_
+
offset_w
)
*
input_depth
;
const
int
out_d
=
d
+
offset_d
;
const
in
dex_
t
out_d
=
d
+
offset_d
;
const
index_t
o_index
=
((
b
*
output_depth
+
out_d
)
*
output_height
+
out_h
)
*
output_width
+
out_w
;
...
...
mace/kernels/depthwise_conv2d.h
浏览文件 @
1fcd812b
...
...
@@ -78,28 +78,27 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, float>
void
DepthwiseConv2dGeneral
(
const
float
*
input
,
const
float
*
filter
,
const
index_t
batch
,
const
index_t
in_height
,
const
index_t
in_width
,
const
index_t
in_channels
,
const
index_t
out_height
,
const
index_t
out_width
,
const
index_t
out_channels
,
const
int
filter_height
,
const
int
filter_width
,
const
int
stride_h
,
const
int
stride_w
,
const
int
dilation_h
,
const
int
dilation_w
,
const
int
pad_top
,
const
int
pad_left
,
const
index_t
*
in_shape
,
const
index_t
*
out_shape
,
const
index_t
*
filter_shape
,
const
int
*
stride_hw
,
const
int
*
dilation_hw
,
const
int
*
pad_hw
,
float
*
output
)
{
const
index_t
multiplier
=
out_channels
/
in_channels
;
const
index_t
multiplier
=
filter_shape
[
0
]
/
filter_shape
[
1
]
;
#pragma omp parallel for collapse(2)
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
index_t
m
=
0
;
m
<
out_channels
;
++
m
)
{
for
(
index_t
h
=
0
;
h
<
out_height
;
++
h
)
{
for
(
index_t
w
=
0
;
w
<
out_width
;
++
w
)
{
for
(
index_t
b
=
0
;
b
<
in_shape
[
0
];
++
b
)
{
for
(
index_t
m
=
0
;
m
<
filter_shape
[
0
];
++
m
)
{
for
(
index_t
h
=
0
;
h
<
out_shape
[
2
];
++
h
)
{
for
(
index_t
w
=
0
;
w
<
out_shape
[
3
];
++
w
)
{
const
index_t
out_channels
=
filter_shape
[
0
];
const
index_t
in_channels
=
filter_shape
[
1
];
const
index_t
filter_height
=
filter_shape
[
2
];
const
index_t
filter_width
=
filter_shape
[
3
];
const
index_t
in_height
=
in_shape
[
2
];
const
index_t
in_width
=
in_shape
[
3
];
const
index_t
out_height
=
out_shape
[
2
];
const
index_t
out_width
=
out_shape
[
3
];
index_t
out_offset
=
((
b
*
out_channels
+
m
)
*
out_height
+
h
)
*
out_width
+
w
;
index_t
c
=
m
/
multiplier
;
...
...
@@ -107,8 +106,8 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, float>
float
sum
=
0
;
for
(
index_t
kh
=
0
;
kh
<
filter_height
;
++
kh
)
{
for
(
index_t
kw
=
0
;
kw
<
filter_width
;
++
kw
)
{
index_t
ih
=
h
*
stride_h
+
kh
*
dilation_h
-
pad_top
;
index_t
iw
=
w
*
stride_
w
+
kw
*
dilation_w
-
pad_left
;
index_t
ih
=
h
*
stride_h
w
[
0
]
+
kh
*
dilation_hw
[
0
]
-
pad_hw
[
0
]
;
index_t
iw
=
w
*
stride_
hw
[
1
]
+
kw
*
dilation_hw
[
1
]
-
pad_hw
[
1
]
;
if
(
ih
>=
0
&&
ih
<
in_height
&&
iw
>=
0
&&
iw
<
in_width
)
{
index_t
in_offset
=
((
b
*
in_channels
+
c
)
*
in_height
+
ih
)
*
in_width
+
iw
;
...
...
@@ -214,20 +213,18 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, float>
auto
bias_data
=
bias
==
nullptr
?
nullptr
:
bias
->
data
<
float
>
();
auto
output_data
=
output
->
mutable_data
<
float
>
();
const
int
pad_hw
[
2
]
=
{
pad_top
,
pad_left
};
const
index_t
input_shape
[
4
]
=
{
batch
,
input_channels
,
input_height
,
input_width
};
if
(
filter_h
==
3
&&
filter_w
==
3
&&
stride_h
==
1
&&
stride_w
==
1
&&
dilation_h
==
1
&&
dilation_w
==
1
)
{
conv_func
=
[
=
](
const
float
*
input
,
float
*
output
)
{
DepthwiseConv2dNeonK3x3S1
(
input
,
filter_data
,
batch
,
input_height
,
input_width
,
input_channels
,
height
,
width
,
channels
,
pad_top
,
pad_left
,
input_shape
,
output_shape
.
data
(),
pad_hw
,
valid_h_start
,
valid_h_stop
,
valid_w_start
,
...
...
@@ -239,15 +236,9 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, float>
conv_func
=
[
=
](
const
float
*
input
,
float
*
output
)
{
DepthwiseConv2dNeonK3x3S2
(
input
,
filter_data
,
batch
,
input_height
,
input_width
,
input_channels
,
height
,
width
,
channels
,
pad_top
,
pad_left
,
input_shape
,
output_shape
.
data
(),
pad_hw
,
valid_h_start
,
valid_h_stop
,
valid_w_start
,
...
...
@@ -258,21 +249,12 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, float>
conv_func
=
[
=
](
const
float
*
input
,
float
*
output
)
{
DepthwiseConv2dGeneral
(
input
,
filter_data
,
batch
,
input_height
,
input_width
,
input_channels
,
height
,
width
,
channels
,
filter_h
,
filter_w
,
stride_h
,
stride_w
,
dilation_h
,
dilation_w
,
pad_top
,
pad_left
,
input_shape
,
output_shape
.
data
(),
filter_shape
.
data
(),
strides_
,
dilations_
,
pad_hw
,
output
);
};
}
...
...
mace/kernels/image_to_buffer.h
浏览文件 @
1fcd812b
...
...
@@ -37,6 +37,10 @@ struct ImageToBufferFunctor : ImageToBufferFunctorBase {
const
BufferType
type
,
Tensor
*
output
,
StatsFuture
*
future
)
{
MACE_UNUSED
(
input
);
MACE_UNUSED
(
type
);
MACE_UNUSED
(
output
);
MACE_UNUSED
(
future
);
MACE_NOT_IMPLEMENTED
;
}
};
...
...
mace/kernels/opencl/bias_add.cc
浏览文件 @
1fcd812b
...
...
@@ -90,7 +90,8 @@ void BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
}
else
{
std
::
vector
<
uint32_t
>
roundup_gws
(
lws
.
size
());
for
(
size_t
i
=
0
;
i
<
lws
.
size
();
++
i
)
{
roundup_gws
[
i
]
=
RoundUp
(
gws
[
i
],
lws
[
i
]);
if
(
lws
[
i
]
!=
0
)
roundup_gws
[
i
]
=
RoundUp
(
gws
[
i
],
lws
[
i
]);
}
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
...
...
mace/kernels/pooling.h
浏览文件 @
1fcd812b
...
...
@@ -75,39 +75,38 @@ struct PoolingFunctor<DeviceType::CPU, float>: PoolingFunctorBase {
}
void
MaxPooling
(
const
float
*
input
,
const
index_t
batch
,
const
index_t
in_height
,
const
index_t
in_width
,
const
index_t
channels
,
const
index_t
out_height
,
const
index_t
out_width
,
const
int
filter_height
,
const
int
filter_width
,
const
int
stride_h
,
const
int
stride_w
,
const
int
dilation_h
,
const
int
dilation_w
,
const
int
pad_top
,
const
int
pad_left
,
const
index_t
*
in_shape
,
const
index_t
*
out_shape
,
const
int
*
filter_hw
,
const
int
*
stride_hw
,
const
int
*
dilation_hw
,
const
int
*
pad_hw
,
float
*
output
)
{
const
index_t
in_image_size
=
in_
height
*
in_width
;
const
index_t
out_image_size
=
out_
height
*
out_width
;
const
index_t
in_batch_size
=
channels
*
in_image_size
;
const
index_t
out_batch_size
=
channels
*
out_image_size
;
const
index_t
in_image_size
=
in_
shape
[
2
]
*
in_shape
[
3
]
;
const
index_t
out_image_size
=
out_
shape
[
2
]
*
out_shape
[
3
]
;
const
index_t
in_batch_size
=
in_shape
[
1
]
*
in_image_size
;
const
index_t
out_batch_size
=
out_shape
[
1
]
*
out_image_size
;
#pragma omp parallel for collapse(2)
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channels
;
++
c
)
{
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
]
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
out_shape
[
1
]
;
++
c
)
{
const
index_t
out_base
=
b
*
out_batch_size
+
c
*
out_image_size
;
const
index_t
in_base
=
b
*
in_batch_size
+
c
*
in_image_size
;
const
index_t
out_height
=
out_shape
[
2
];
const
index_t
out_width
=
out_shape
[
3
];
const
index_t
in_height
=
in_shape
[
2
];
const
index_t
in_width
=
in_shape
[
3
];
for
(
index_t
h
=
0
;
h
<
out_height
;
++
h
)
{
for
(
index_t
w
=
0
;
w
<
out_width
;
++
w
)
{
const
index_t
out_offset
=
out_base
+
h
*
out_width
+
w
;
float
res
=
std
::
numeric_limits
<
float
>::
lowest
();
for
(
int
fh
=
0
;
fh
<
filter_height
;
++
fh
)
{
for
(
int
fw
=
0
;
fw
<
filter_width
;
++
fw
)
{
int
inh
=
h
*
stride_h
+
dilation_h
*
fh
-
pad_top
;
int
inw
=
w
*
stride_w
+
dilation_w
*
fw
-
pad_left
;
for
(
int
fh
=
0
;
fh
<
filter_hw
[
0
];
++
fh
)
{
for
(
int
fw
=
0
;
fw
<
filter_hw
[
1
];
++
fw
)
{
index_t
inh
=
h
*
stride_hw
[
0
]
+
dilation_hw
[
0
]
*
fh
-
pad_hw
[
0
];
index_t
inw
=
w
*
stride_hw
[
1
]
+
dilation_hw
[
1
]
*
fw
-
pad_hw
[
1
];
if
(
inh
>=
0
&&
inh
<
in_height
&&
inw
>=
0
&&
inw
<
in_width
)
{
index_t
input_offset
=
in_base
+
inh
*
in_width
+
inw
;
res
=
std
::
max
(
res
,
input
[
input_offset
]);
...
...
@@ -122,40 +121,38 @@ struct PoolingFunctor<DeviceType::CPU, float>: PoolingFunctorBase {
}
void
AvgPooling
(
const
float
*
input
,
const
index_t
batch
,
const
index_t
in_height
,
const
index_t
in_width
,
const
index_t
channels
,
const
index_t
out_height
,
const
index_t
out_width
,
const
int
filter_height
,
const
int
filter_width
,
const
int
stride_h
,
const
int
stride_w
,
const
int
dilation_h
,
const
int
dilation_w
,
const
int
pad_top
,
const
int
pad_left
,
const
index_t
*
in_shape
,
const
index_t
*
out_shape
,
const
int
*
filter_hw
,
const
int
*
stride_hw
,
const
int
*
dilation_hw
,
const
int
*
pad_hw
,
float
*
output
)
{
const
index_t
in_image_size
=
in_
height
*
in_width
;
const
index_t
out_image_size
=
out_
height
*
out_width
;
const
index_t
in_batch_size
=
channels
*
in_image_size
;
const
index_t
out_batch_size
=
channels
*
out_image_size
;
const
index_t
in_image_size
=
in_
shape
[
2
]
*
in_shape
[
3
]
;
const
index_t
out_image_size
=
out_
shape
[
2
]
*
out_shape
[
3
]
;
const
index_t
in_batch_size
=
in_shape
[
1
]
*
in_image_size
;
const
index_t
out_batch_size
=
out_shape
[
1
]
*
out_image_size
;
#pragma omp parallel for collapse(2)
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
channels
;
++
c
)
{
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
]
;
++
b
)
{
for
(
index_t
c
=
0
;
c
<
out_shape
[
1
]
;
++
c
)
{
const
index_t
out_base
=
b
*
out_batch_size
+
c
*
out_image_size
;
const
index_t
in_base
=
b
*
in_batch_size
+
c
*
in_image_size
;
const
index_t
in_height
=
in_shape
[
2
];
const
index_t
in_width
=
in_shape
[
3
];
const
index_t
out_height
=
out_shape
[
2
];
const
index_t
out_width
=
out_shape
[
3
];
for
(
index_t
h
=
0
;
h
<
out_height
;
++
h
)
{
for
(
index_t
w
=
0
;
w
<
out_width
;
++
w
)
{
const
index_t
out_offset
=
out_base
+
h
*
out_width
+
w
;
float
res
=
0
;
int
block_size
=
0
;
for
(
int
fh
=
0
;
fh
<
filter_height
;
++
fh
)
{
for
(
int
fw
=
0
;
fw
<
filter_width
;
++
fw
)
{
int
inh
=
h
*
stride_h
+
dilation_h
*
fh
-
pad_top
;
int
inw
=
w
*
stride_w
+
dilation_w
*
fw
-
pad_left
;
for
(
int
fh
=
0
;
fh
<
filter_hw
[
0
];
++
fh
)
{
for
(
int
fw
=
0
;
fw
<
filter_hw
[
1
];
++
fw
)
{
index_t
inh
=
h
*
stride_hw
[
0
]
+
dilation_hw
[
0
]
*
fh
-
pad_hw
[
0
];
index_t
inw
=
w
*
stride_hw
[
1
]
+
dilation_hw
[
1
]
*
fw
-
pad_hw
[
1
];
if
(
inh
>=
0
&&
inh
<
in_height
&&
inw
>=
0
&&
inw
<
in_width
)
{
index_t
input_offset
=
in_base
+
inh
*
in_width
+
inw
;
res
+=
input
[
input_offset
];
...
...
@@ -200,59 +197,25 @@ struct PoolingFunctor<DeviceType::CPU, float>: PoolingFunctorBase {
const
float
*
input
=
input_tensor
->
data
<
float
>
();
float
*
output
=
output_tensor
->
mutable_data
<
float
>
();
const
index_t
*
input_shape
=
input_tensor
->
shape
().
data
();
index_t
batch
=
output_shape
[
0
];
index_t
channels
=
output_shape
[
1
];
index_t
height
=
output_shape
[
2
];
index_t
width
=
output_shape
[
3
];
index_t
input_height
=
input_shape
[
2
];
index_t
input_width
=
input_shape
[
3
];
int
filter_h
=
kernels_
[
0
];
int
filter_w
=
kernels_
[
1
];
int
stride_h
=
strides_
[
0
];
int
stride_w
=
strides_
[
1
];
int
dilation_h
=
dilations_
[
0
];
int
dilation_w
=
dilations_
[
1
];
int
pad_top
=
paddings
[
0
]
/
2
;
int
pad_left
=
paddings
[
1
]
/
2
;
int
pad_hw
[
2
]
=
{
paddings
[
0
]
/
2
,
paddings
[
1
]
/
2
};
if
(
pooling_type_
==
PoolingType
::
MAX
)
{
MaxPooling
(
input
,
batch
,
input_height
,
input_width
,
channels
,
height
,
width
,
filter_h
,
filter_w
,
stride_h
,
stride_w
,
dilation_h
,
dilation_w
,
pad_top
,
pad_left
,
input_shape
,
output_shape
.
data
(),
kernels_
,
strides_
,
dilations_
,
pad_hw
,
output
);
}
else
if
(
pooling_type_
==
PoolingType
::
AVG
)
{
AvgPooling
(
input
,
batch
,
input_height
,
input_width
,
channels
,
height
,
width
,
filter_h
,
filter_w
,
stride_h
,
stride_w
,
dilation_h
,
dilation_w
,
pad_top
,
pad_left
,
input_shape
,
output_shape
.
data
(),
kernels_
,
strides_
,
dilations_
,
pad_hw
,
output
);
}
else
{
MACE_NOT_IMPLEMENTED
;
...
...
mace/kernels/winograd_transform.h
浏览文件 @
1fcd812b
...
...
@@ -111,6 +111,7 @@ struct WinogradInverseTransformFunctor : WinogradInverseTransformFunctorBase {
MACE_UNUSED
(
input
);
MACE_UNUSED
(
bias
);
MACE_UNUSED
(
output
);
MACE_UNUSED
(
future
);
MACE_NOT_IMPLEMENTED
;
}
};
...
...
mace/ops/activation.h
浏览文件 @
1fcd812b
...
...
@@ -38,7 +38,7 @@ class ActivationOp : public Operator<D, T> {
const
Tensor
*
input_tensor
=
this
->
Input
(
0
);
const
Tensor
*
alpha_tensor
=
this
->
InputSize
()
>=
2
?
this
->
Input
(
1
)
:
nullptr
;
Tensor
*
output_tensor
=
this
->
outputs_
[
0
]
;
Tensor
*
output_tensor
=
this
->
Output
(
0
)
;
output_tensor
->
ResizeLike
(
input_tensor
);
functor_
(
input_tensor
,
alpha_tensor
,
output_tensor
,
future
);
...
...
mace/ops/ops_test_util.h
浏览文件 @
1fcd812b
...
...
@@ -618,6 +618,8 @@ struct Expector<EXP_TYPE, RES_TYPE, false> {
static
void
Near
(
const
Tensor
&
x
,
const
Tensor
&
y
,
const
double
rel_err
,
const
double
abs_err
)
{
MACE_UNUSED
(
rel_err
);
MACE_UNUSED
(
abs_err
);
Equal
(
x
,
y
);
}
};
...
...
mace/utils/logging.cc
浏览文件 @
1fcd812b
...
...
@@ -15,6 +15,7 @@
#include "mace/utils/logging.h"
#include <stdlib.h>
#include <string.h>
#if defined(ANDROID) || defined(__ANDROID__)
#include <android/log.h>
#include <iostream>
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录