Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Xiaomi
Mace
提交
dd501eea
Mace
项目概览
Xiaomi
/
Mace
通知
106
Star
40
Fork
27
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
Mace
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
dd501eea
编写于
2月 11, 2018
作者:
L
Liangliang He
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Fix boundary checking and tuning performance
上级
aba88a23
变更
3
隐藏空白更改
内联
并排
Showing
3 changed file
with
190 addition
and
69 deletion
+190
-69
mace/kernels/depthwise_conv2d.h
mace/kernels/depthwise_conv2d.h
+173
-54
mace/ops/depthwise_conv2d_benchmark.cc
mace/ops/depthwise_conv2d_benchmark.cc
+15
-13
mace/ops/depthwise_conv2d_test.cc
mace/ops/depthwise_conv2d_test.cc
+2
-2
未找到文件。
mace/kernels/depthwise_conv2d.h
浏览文件 @
dd501eea
...
...
@@ -5,6 +5,10 @@
#ifndef MACE_KERNELS_DEPTHWISE_CONV2D_H_
#define MACE_KERNELS_DEPTHWISE_CONV2D_H_
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
#include <arm_neon.h>
#endif
#include "mace/core/common.h"
#include "mace/core/future.h"
#include "mace/core/public/mace.h"
...
...
@@ -64,7 +68,9 @@ void DepthwiseConv2dKernel(const T *input_ptr,
inw
>=
input_width
)
{
MACE_CHECK
(
inh
>=
padded_h_start
&&
inh
<
padded_h_stop
&&
inw
>=
padded_w_start
&&
inw
<
padded_w_stop
,
"Out of range read from input: "
,
inh
,
", "
,
inw
);
"Out of range read from input: "
,
padded_h_start
,
" <= "
,
inh
,
" < "
,
padded_h_stop
,
", "
,
padded_w_start
,
" <= "
,
inw
,
" < "
,
padded_w_stop
);
}
else
{
index_t
input_offset
=
n
*
input_height
*
input_width
*
input_channels
+
...
...
@@ -108,33 +114,120 @@ void DepthwiseConv2dNoOOBCheckKernel(const T *input_ptr,
int
h_stop
,
int
w_start
,
int
w_stop
)
{
if
(
multiplier
==
1
)
{
constexpr
int
c_tile_size
=
4
;
#pragma omp parallel for collapse(3)
for
(
int
n
=
0
;
n
<
batch
;
++
n
)
{
for
(
int
h
=
h_start
;
h
<
h_stop
;
++
h
)
{
for
(
int
w
=
w_start
;
w
<
w_stop
;
++
w
)
{
int
c
;
for
(
c
=
0
;
c
+
c_tile_size
<=
channels
;
c
+=
c_tile_size
)
{
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
static_assert
(
c_tile_size
==
4
,
"channels tile size must be 4"
);
float32x4_t
sum
=
vdupq_n_f32
(
0
);
if
(
bias_ptr
!=
nullptr
)
{
sum
=
vld1q_f32
(
bias_ptr
+
c
);
}
#else
T
sum
[
c_tile_size
]
=
{
0
};
if
(
bias_ptr
!=
nullptr
)
{
for
(
int
ci
=
0
;
ci
<
c_tile_size
;
++
ci
)
{
sum
[
ci
]
=
bias_ptr
[
c
+
ci
];
}
}
#endif
const
T
*
filter_base
=
filter_ptr
+
c
;
for
(
int
kh
=
0
;
kh
<
kernel_h
;
++
kh
)
{
for
(
int
kw
=
0
;
kw
<
kernel_w
;
++
kw
)
{
int
inh
=
padded_h_start
+
h
*
stride_h
+
dilation_h
*
kh
;
int
inw
=
padded_w_start
+
w
*
stride_w
+
dilation_w
*
kw
;
MACE_ASSERT
(
inh
>=
0
&&
inh
<
input_height
&&
inw
>=
0
&&
inw
<
input_width
);
index_t
input_offset
=
n
*
input_height
*
input_width
*
input_channels
+
inh
*
input_width
*
input_channels
+
inw
*
input_channels
+
c
;
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
float32x4_t
in
=
vld1q_f32
(
input_ptr
+
input_offset
);
float32x4_t
weights
=
vld1q_f32
(
filter_base
);
sum
=
vfmaq_f32
(
sum
,
in
,
weights
);
#else
for
(
int
ci
=
0
;
ci
<
c_tile_size
;
++
ci
)
{
sum
[
ci
]
+=
input_ptr
[
input_offset
+
ci
]
*
filter_base
[
ci
];
// HWIM
}
#endif
filter_base
+=
input_channels
;
}
}
index_t
offset
=
n
*
height
*
width
*
channels
+
h
*
width
*
channels
+
w
*
channels
+
c
;
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
vst1q_f32
(
output_ptr
+
offset
,
sum
);
#else
for
(
int
ci
=
0
;
ci
<
c_tile_size
;
++
ci
)
{
output_ptr
[
offset
+
ci
]
=
sum
[
ci
];
}
#endif
}
for
(;
c
<
channels
;
++
c
)
{
T
bias_channel
=
bias_ptr
?
bias_ptr
[
c
]
:
0
;
index_t
offset
=
n
*
height
*
width
*
channels
+
h
*
width
*
channels
+
w
*
channels
+
c
;
output_ptr
[
offset
]
=
bias_channel
;
T
sum
=
0
;
const
T
*
filter_base
=
filter_ptr
+
c
;
for
(
int
kh
=
0
;
kh
<
kernel_h
;
++
kh
)
{
for
(
int
kw
=
0
;
kw
<
kernel_w
;
++
kw
)
{
int
inh
=
padded_h_start
+
h
*
stride_h
+
dilation_h
*
kh
;
int
inw
=
padded_w_start
+
w
*
stride_w
+
dilation_w
*
kw
;
MACE_ASSERT
(
inh
>=
0
&&
inh
<
input_height
&&
inw
>=
0
&&
inw
<
input_width
);
index_t
input_offset
=
n
*
input_height
*
input_width
*
input_channels
+
inh
*
input_width
*
input_channels
+
inw
*
input_channels
+
c
;
sum
+=
input_ptr
[
input_offset
]
*
filter_base
[
0
];
// HWIM
filter_base
+=
input_channels
*
multiplier
;
}
}
output_ptr
[
offset
]
+=
sum
;
}
}
}
}
}
else
{
#pragma omp parallel for collapse(4)
for
(
int
n
=
0
;
n
<
batch
;
++
n
)
{
for
(
int
h
=
h_start
;
h
<
h_stop
;
++
h
)
{
for
(
int
w
=
w_start
;
w
<
w_stop
;
++
w
)
{
for
(
int
c
=
0
;
c
<
channels
;
++
c
)
{
const
index_t
inc
=
c
/
multiplier
;
const
index_t
m
=
c
%
multiplier
;
T
bias_channel
=
bias_ptr
?
bias_ptr
[
c
]
:
0
;
index_t
offset
=
n
*
height
*
width
*
channels
+
h
*
width
*
channels
+
w
*
channels
+
c
;
output_ptr
[
offset
]
=
bias_channel
;
T
sum
=
0
;
const
T
*
filter_base
=
filter_ptr
+
inc
*
multiplier
+
m
;
for
(
int
kh
=
0
;
kh
<
kernel_h
;
++
kh
)
{
for
(
int
kw
=
0
;
kw
<
kernel_w
;
++
kw
)
{
int
inh
=
padded_h_start
+
h
*
stride_h
+
dilation_h
*
kh
;
int
inw
=
padded_w_start
+
w
*
stride_w
+
dilation_w
*
kw
;
index_t
input_offset
=
n
*
input_height
*
input_width
*
input_channels
+
inh
*
input_width
*
input_channels
+
inw
*
input_channels
+
inc
;
// TODO vectorize this
sum
+=
input_ptr
[
input_offset
]
*
filter_base
[
0
];
// HWIM
filter_base
+=
input_channels
*
multiplier
;
for
(
int
n
=
0
;
n
<
batch
;
++
n
)
{
for
(
int
h
=
h_start
;
h
<
h_stop
;
++
h
)
{
for
(
int
w
=
w_start
;
w
<
w_stop
;
++
w
)
{
for
(
int
c
=
0
;
c
<
channels
;
++
c
)
{
const
index_t
inc
=
c
/
multiplier
;
const
index_t
m
=
c
%
multiplier
;
T
bias_channel
=
bias_ptr
?
bias_ptr
[
c
]
:
0
;
index_t
offset
=
n
*
height
*
width
*
channels
+
h
*
width
*
channels
+
w
*
channels
+
c
;
output_ptr
[
offset
]
=
bias_channel
;
T
sum
=
0
;
const
T
*
filter_base
=
filter_ptr
+
inc
*
multiplier
+
m
;
for
(
int
kh
=
0
;
kh
<
kernel_h
;
++
kh
)
{
for
(
int
kw
=
0
;
kw
<
kernel_w
;
++
kw
)
{
int
inh
=
padded_h_start
+
h
*
stride_h
+
dilation_h
*
kh
;
int
inw
=
padded_w_start
+
w
*
stride_w
+
dilation_w
*
kw
;
MACE_ASSERT
(
inh
>=
0
&&
inh
<
input_height
&&
inw
>=
0
&&
inw
<
input_width
);
index_t
input_offset
=
n
*
input_height
*
input_width
*
input_channels
+
inh
*
input_width
*
input_channels
+
inw
*
input_channels
+
inc
;
sum
+=
input_ptr
[
input_offset
]
*
filter_base
[
0
];
// HWIM
filter_base
+=
input_channels
*
multiplier
;
}
}
output_ptr
[
offset
]
+=
sum
;
}
output_ptr
[
offset
]
+=
sum
;
}
}
}
...
...
@@ -230,10 +323,15 @@ struct DepthwiseConv2dFunctor : public DepthwiseConv2dFunctorBase {
MACE_CHECK
(
batch
==
input_batch
,
"Input/Output batch size mismatch"
);
// The left-upper most offset of the padded input
int
padded_h_start
=
0
-
paddings
[
0
]
/
2
;
int
padded_w_start
=
0
-
paddings
[
1
]
/
2
;
index_t
padded_h_stop
=
input_height
+
paddings
[
0
]
-
paddings
[
0
]
/
2
;
index_t
padded_w_stop
=
input_width
+
paddings
[
1
]
-
paddings
[
1
]
/
2
;
int
paddings_top
=
paddings
[
0
]
/
2
;
int
paddings_bottom
=
paddings
[
0
]
-
paddings_top
;
int
paddings_left
=
paddings
[
1
]
/
2
;
int
paddings_right
=
paddings
[
1
]
-
paddings_left
;
int
padded_h_start
=
0
-
paddings_top
;
int
padded_w_start
=
0
-
paddings_left
;
index_t
padded_h_stop
=
input_height
+
paddings_bottom
;
index_t
padded_w_stop
=
input_width
+
paddings_right
;
Tensor
::
MappingGuard
input_mapper
(
input
);
Tensor
::
MappingGuard
filter_mapper
(
filter
);
...
...
@@ -244,38 +342,59 @@ struct DepthwiseConv2dFunctor : public DepthwiseConv2dFunctorBase {
const
T
*
bias_ptr
=
bias
==
nullptr
?
nullptr
:
bias
->
data
<
T
>
();
T
*
output_ptr
=
output
->
mutable_data
<
T
>
();
int
valid_h_start
=
paddings_top
==
0
?
0
:
(
paddings_top
-
1
)
/
stride_h
+
1
;
int
valid_h_stop
=
paddings_bottom
==
0
?
height
:
height
-
((
paddings_bottom
-
1
)
/
stride_h
+
1
);
int
valid_w_start
=
paddings_left
==
0
?
0
:
(
paddings_left
-
1
)
/
stride_w
+
1
;
int
valid_w_stop
=
paddings_right
==
0
?
width
:
width
-
((
paddings_right
-
1
)
/
stride_w
+
1
);
// Calculate border elements with out-of-boundary checking
DepthwiseConv2dKernel
<
T
>
(
input_ptr
,
filter_ptr
,
bias_ptr
,
output_ptr
,
batch
,
height
,
width
,
channels
,
input_height
,
input_width
,
input_channels
,
multiplier
,
padded_h_start
,
padded_h_stop
,
padded_w_start
,
padded_w_stop
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
dilation_h
,
dilation_w
,
0
,
1
,
0
,
width
);
DepthwiseConv2dKernel
<
T
>
(
input_ptr
,
filter_ptr
,
bias_ptr
,
output_ptr
,
batch
,
height
,
width
,
channels
,
input_height
,
input_width
,
input_channels
,
multiplier
,
padded_h_start
,
padded_h_stop
,
padded_w_start
,
padded_w_stop
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
dilation_h
,
dilation_w
,
height
-
1
,
height
,
0
,
width
);
DepthwiseConv2dKernel
<
T
>
(
input_ptr
,
filter_ptr
,
bias_ptr
,
output_ptr
,
batch
,
height
,
width
,
channels
,
input_height
,
input_width
,
input_channels
,
multiplier
,
padded_h_start
,
padded_h_stop
,
padded_w_start
,
padded_w_stop
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
dilation_h
,
dilation_w
,
1
,
height
-
1
,
0
,
1
);
DepthwiseConv2dKernel
<
T
>
(
input_ptr
,
filter_ptr
,
bias_ptr
,
output_ptr
,
batch
,
height
,
width
,
channels
,
input_height
,
input_width
,
input_channels
,
multiplier
,
padded_h_start
,
padded_h_stop
,
padded_w_start
,
padded_w_stop
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
dilation_h
,
dilation_w
,
1
,
height
-
1
,
width
-
1
,
width
);
if
(
valid_h_start
>
0
)
{
DepthwiseConv2dKernel
<
T
>
(
input_ptr
,
filter_ptr
,
bias_ptr
,
output_ptr
,
batch
,
height
,
width
,
channels
,
input_height
,
input_width
,
input_channels
,
multiplier
,
padded_h_start
,
padded_h_stop
,
padded_w_start
,
padded_w_stop
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
dilation_h
,
dilation_w
,
0
,
valid_h_start
,
0
,
width
);
}
if
(
valid_h_stop
<
height
)
{
DepthwiseConv2dKernel
<
T
>
(
input_ptr
,
filter_ptr
,
bias_ptr
,
output_ptr
,
batch
,
height
,
width
,
channels
,
input_height
,
input_width
,
input_channels
,
multiplier
,
padded_h_start
,
padded_h_stop
,
padded_w_start
,
padded_w_stop
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
dilation_h
,
dilation_w
,
std
::
max
(
valid_h_start
,
valid_h_stop
),
height
,
0
,
width
);
}
if
(
valid_w_start
>
0
)
{
DepthwiseConv2dKernel
<
T
>
(
input_ptr
,
filter_ptr
,
bias_ptr
,
output_ptr
,
batch
,
height
,
width
,
channels
,
input_height
,
input_width
,
input_channels
,
multiplier
,
padded_h_start
,
padded_h_stop
,
padded_w_start
,
padded_w_stop
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
dilation_h
,
dilation_w
,
valid_h_start
,
valid_h_stop
,
0
,
valid_w_start
);
}
if
(
valid_w_stop
<
width
)
{
DepthwiseConv2dKernel
<
T
>
(
input_ptr
,
filter_ptr
,
bias_ptr
,
output_ptr
,
batch
,
height
,
width
,
channels
,
input_height
,
input_width
,
input_channels
,
multiplier
,
padded_h_start
,
padded_h_stop
,
padded_w_start
,
padded_w_stop
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
dilation_h
,
dilation_w
,
valid_h_start
,
valid_h_stop
,
std
::
max
(
valid_w_start
,
valid_w_stop
),
width
);
}
// Calculate border elements without out-of-boundary checking
DepthwiseConv2dNoOOBCheckKernel
<
T
>
(
input_ptr
,
filter_ptr
,
bias_ptr
,
output_ptr
,
batch
,
height
,
width
,
channels
,
input_height
,
input_width
,
input_channels
,
multiplier
,
padded_h_start
,
padded_h_stop
,
padded_w_start
,
padded_w_stop
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
dilation_h
,
dilation_w
,
1
,
height
-
1
,
1
,
width
-
1
);
kernel_w
,
stride_h
,
stride_w
,
dilation_h
,
dilation_w
,
valid_h_start
,
valid_h_stop
,
valid_w_start
,
valid_w_stop
);
output_ptr
=
output
->
mutable_data
<
T
>
();
DoActivation
(
output_ptr
,
output_ptr
,
output
->
NumElements
(),
activation_
,
...
...
mace/ops/depthwise_conv
_
2d_benchmark.cc
→
mace/ops/depthwise_conv2d_benchmark.cc
浏览文件 @
dd501eea
...
...
@@ -75,25 +75,27 @@ static void DepthwiseConv2d(int iters,
}
}
#define BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, STRIDE, P, OC, TYPE, \
DEVICE) \
static void \
BM_DEPTHWISE_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::ItemsProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
DepthwiseConv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE, \
mace::Padding::P, OC); \
} \
BENCHMARK( \
BM_DEPTHWISE_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE)
#define BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, STRIDE, P, OC, TYPE,
\
DEVICE)
\
static void
\
BM_DEPTHWISE_
CONV_
2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE( \
int iters) {
\
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;
\
mace::testing::ItemsProcessed(tot);
\
mace::testing::BytesProcessed(tot *(sizeof(TYPE)));
\
DepthwiseConv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE,
\
mace::Padding::P, OC);
\
}
\
BENCHMARK(
\
BM_DEPTHWISE_
CONV_
2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE)
#define BM_DEPTHWISE_CONV_2D(N, C, H, W, KH, KW, S, P, OC) \
BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, float, CPU); \
BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, float, OPENCL); \
BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, half, OPENCL);
BM_DEPTHWISE_CONV_2D
(
1
,
32
,
112
,
112
,
3
,
3
,
1
,
SAME
,
1
);
BM_DEPTHWISE_CONV_2D
(
1
,
32
,
112
,
112
,
3
,
3
,
2
,
SAME
,
1
);
BM_DEPTHWISE_CONV_2D
(
1
,
64
,
32
,
32
,
3
,
3
,
1
,
VALID
,
1
);
BM_DEPTHWISE_CONV_2D
(
1
,
64
,
33
,
31
,
3
,
3
,
1
,
VALID
,
1
);
BM_DEPTHWISE_CONV_2D
(
1
,
64
,
32
,
32
,
3
,
3
,
1
,
SAME
,
1
);
...
...
mace/ops/depthwise_conv2d_test.cc
浏览文件 @
dd501eea
...
...
@@ -280,10 +280,10 @@ void TestNxNS12(const index_t height, const index_t width) {
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"DeviceOutput"
),
0.1
);
};
for
(
int
kernel_size
:
{
3
})
{
for
(
int
kernel_size
:
{
2
,
3
,
4
})
{
for
(
int
stride
:
{
1
,
2
})
{
func
(
kernel_size
,
kernel_size
,
stride
,
stride
,
VALID
);
//
func(kernel_size, kernel_size, stride, stride, SAME);
func
(
kernel_size
,
kernel_size
,
stride
,
stride
,
SAME
);
}
}
}
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录