Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
慢慢CG
Mace
提交
780f5a60
Mace
项目概览
慢慢CG
/
Mace
与 Fork 源项目一致
Fork自
Xiaomi / Mace
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
Mace
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
提交
780f5a60
编写于
2月 08, 2018
作者:
L
Liangliang He
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Refactor conv2d NEON implementations
上级
07f8ff18
变更
52
隐藏空白更改
内联
并排
Showing
52 changed file
with
904 addition
and
1549 deletion
+904
-1549
mace/core/tensor.h
mace/core/tensor.h
+2
-1
mace/kernels/BUILD
mace/kernels/BUILD
+4
-5
mace/kernels/addn.h
mace/kernels/addn.h
+6
-3
mace/kernels/batch_norm.h
mace/kernels/batch_norm.h
+3
-3
mace/kernels/conv_2d.h
mace/kernels/conv_2d.h
+518
-49
mace/kernels/conv_pool_2d_util.cc
mace/kernels/conv_pool_2d_util.cc
+88
-41
mace/kernels/conv_pool_2d_util.h
mace/kernels/conv_pool_2d_util.h
+6
-0
mace/kernels/depthwise_conv2d.h
mace/kernels/depthwise_conv2d.h
+2
-2
mace/kernels/neon/addn_neon.cc
mace/kernels/neon/addn_neon.cc
+3
-1
mace/kernels/neon/avg_pooling_neon_3x3.cc
mace/kernels/neon/avg_pooling_neon_3x3.cc
+0
-224
mace/kernels/neon/batch_norm_neon.cc
mace/kernels/neon/batch_norm_neon.cc
+30
-18
mace/kernels/neon/global_avg_pooling_neon.cc
mace/kernels/neon/global_avg_pooling_neon.cc
+0
-56
mace/kernels/neon/max_pooling_neon_2x2.cc
mace/kernels/neon/max_pooling_neon_2x2.cc
+0
-173
mace/kernels/neon/max_pooling_neon_3x3.cc
mace/kernels/neon/max_pooling_neon_3x3.cc
+0
-220
mace/kernels/neon/pooling_neon.cc
mace/kernels/neon/pooling_neon.cc
+0
-131
mace/kernels/neon/relu_neon.cc
mace/kernels/neon/relu_neon.cc
+0
-70
mace/kernels/opencl/cl/buffer_to_image.cl
mace/kernels/opencl/cl/buffer_to_image.cl
+25
-20
mace/kernels/opencl/depthwise_conv_opencl.cc
mace/kernels/opencl/depthwise_conv_opencl.cc
+2
-2
mace/kernels/opencl/helper.cc
mace/kernels/opencl/helper.cc
+5
-4
mace/ops/activation.cc
mace/ops/activation.cc
+0
-8
mace/ops/activation_benchmark.cc
mace/ops/activation_benchmark.cc
+10
-9
mace/ops/activation_test.cc
mace/ops/activation_test.cc
+0
-36
mace/ops/addn_benchmark.cc
mace/ops/addn_benchmark.cc
+11
-11
mace/ops/addn_test.cc
mace/ops/addn_test.cc
+0
-6
mace/ops/batch_norm_benchmark.cc
mace/ops/batch_norm_benchmark.cc
+20
-17
mace/ops/batch_norm_test.cc
mace/ops/batch_norm_test.cc
+8
-16
mace/ops/batch_to_space_benchmark.cc
mace/ops/batch_to_space_benchmark.cc
+6
-6
mace/ops/bias_add.cc
mace/ops/bias_add.cc
+0
-10
mace/ops/bias_add_benchmark.cc
mace/ops/bias_add_benchmark.cc
+18
-17
mace/ops/buffer_to_image.cc
mace/ops/buffer_to_image.cc
+1
-1
mace/ops/buffer_to_image.h
mace/ops/buffer_to_image.h
+2
-2
mace/ops/conv_2d.cc
mace/ops/conv_2d.cc
+0
-14
mace/ops/conv_2d_benchmark.cc
mace/ops/conv_2d_benchmark.cc
+44
-48
mace/ops/conv_2d_test.cc
mace/ops/conv_2d_test.cc
+12
-164
mace/ops/depthwise_conv2d.cc
mace/ops/depthwise_conv2d.cc
+0
-8
mace/ops/depthwise_conv2d_test.cc
mace/ops/depthwise_conv2d_test.cc
+0
-19
mace/ops/depthwise_conv_2d_benchmark.cc
mace/ops/depthwise_conv_2d_benchmark.cc
+16
-15
mace/ops/folded_batch_norm.cc
mace/ops/folded_batch_norm.cc
+0
-8
mace/ops/fused_conv_2d.cc
mace/ops/fused_conv_2d.cc
+0
-6
mace/ops/fused_conv_2d_test.cc
mace/ops/fused_conv_2d_test.cc
+6
-6
mace/ops/global_avg_pooling.cc
mace/ops/global_avg_pooling.cc
+0
-8
mace/ops/global_avg_pooling_test.cc
mace/ops/global_avg_pooling_test.cc
+0
-26
mace/ops/matmul_benchmark.cc
mace/ops/matmul_benchmark.cc
+6
-6
mace/ops/pooling.cc
mace/ops/pooling.cc
+0
-8
mace/ops/resize_bilinear.cc
mace/ops/resize_bilinear.cc
+0
-8
mace/ops/resize_bilinear_benchmark.cc
mace/ops/resize_bilinear_benchmark.cc
+12
-12
mace/ops/softmax_benchmark.cc
mace/ops/softmax_benchmark.cc
+9
-8
mace/ops/space_to_batch_benchmark.cc
mace/ops/space_to_batch_benchmark.cc
+6
-6
mace/ops/winograd_convolution_test.cc
mace/ops/winograd_convolution_test.cc
+4
-4
mace/ops/winograd_transform_benchmark.cc
mace/ops/winograd_transform_benchmark.cc
+11
-11
mace/utils/utils.h
mace/utils/utils.h
+5
-0
tools/bazel-adb-run.sh
tools/bazel-adb-run.sh
+3
-2
未找到文件。
mace/core/tensor.h
浏览文件 @
780f5a60
...
...
@@ -288,7 +288,8 @@ class Tensor {
}
CASES
(
dtype_
,
(
os
<<
(
this
->
data
<
T
>
()[
i
])
<<
", "
));
}
LOG
(
INFO
)
<<
os
.
str
();
LOG
(
INFO
)
<<
"Tensor size: ["
<<
dim
(
0
)
<<
", "
<<
dim
(
1
)
<<
", "
<<
dim
(
2
)
<<
", "
<<
dim
(
3
)
<<
"], content:
\n
"
<<
os
.
str
();
}
inline
size_t
SizeOfType
()
const
{
...
...
mace/kernels/BUILD
浏览文件 @
780f5a60
...
...
@@ -15,15 +15,14 @@ cc_library(
"*.cc"
,
"opencl/*.cc"
,
])
+
if_neon_enabled
(
glob
([
"neon/*.cc"
,
"neon/addn_neon.cc"
,
"neon/batch_norm_neon.cc"
,
])),
hdrs
=
glob
([
"*.h"
,
"opencl/*.h"
,
])
+
if_neon_enabled
(
glob
([
"neon/*.h"
,
])),
copts
=
if_openmp_enabled
([
"-fopenmp"
]),
]),
copts
=
if_openmp_enabled
([
"-fopenmp"
])
+
if_neon_enabled
([
"-DMACE_ENABLE_NEON"
]),
linkopts
=
if_android
([
"-lm"
]),
deps
=
[
"//mace/core"
,
...
...
mace/kernels/addn.h
浏览文件 @
780f5a60
...
...
@@ -5,6 +5,10 @@
#ifndef MACE_KERNELS_ADDN_H_
#define MACE_KERNELS_ADDN_H_
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
#include <arm_neon.h>
#endif
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/core/runtime/opencl/cl2_header.h"
...
...
@@ -12,7 +16,6 @@
namespace
mace
{
namespace
kernels
{
template
<
DeviceType
D
,
typename
T
>
struct
AddNFunctor
{
void
operator
()(
const
std
::
vector
<
const
Tensor
*>
&
input_tensors
,
...
...
@@ -47,7 +50,7 @@ struct AddNFunctor<DeviceType::OPENCL, T> {
cl
::
Kernel
kernel_
;
};
}
//
namespace kernels
}
//
namespace mace
}
// namespace kernels
}
// namespace mace
#endif // MACE_KERNELS_ADDN_H_
mace/kernels/batch_norm.h
浏览文件 @
780f5a60
...
...
@@ -136,7 +136,7 @@ struct BatchNormFunctor<DeviceType::OPENCL, T> : BatchNormFunctorBase {
cl
::
Kernel
kernel_
;
};
}
//
namepsace kernels
}
//
namespace mace
}
// namepsace kernels
}
// namespace mace
#endif //
MACE_KERNELS_BATCH_NORM_H_
#endif // MACE_KERNELS_BATCH_NORM_H_
mace/kernels/conv_2d.h
浏览文件 @
780f5a60
...
...
@@ -5,14 +5,176 @@
#ifndef MACE_KERNELS_CONV_2D_H_
#define MACE_KERNELS_CONV_2D_H_
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
#include <arm_neon.h>
#endif
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
#include "mace/kernels/activation.h"
#include "mace/kernels/conv_pool_2d_util.h"
#include "mace/
core/runtime/opencl/cl2_header
.h"
#include "mace/
utils/utils
.h"
namespace
mace
{
namespace
kernels
{
namespace
{
template
<
typename
T
,
int
inc_tile_size
,
int
c_count
,
int
h_count
,
int
w_count
>
void
Conv2dKernelFunc
(
const
T
*
input_ptr
,
// batch start
const
T
*
filter_ptr
,
const
T
*
bias_ptr
,
T
*
output_ptr
,
// batch start
const
int
h_offset
,
const
int
w_offset
,
const
int
c_offset
,
const
int
kernel_h
,
const
int
kernel_w
,
const
int
stride_h
,
const
int
stride_w
,
const
int
dilation_h
,
const
int
dilation_w
,
const
int
channels
,
const
int
input_channels
,
const
int
width
,
const
int
padded_width
)
{
T
sum
[
h_count
*
w_count
*
c_count
]
=
{
0.0
f
};
if
(
bias_ptr
!=
nullptr
)
{
for
(
int
hi
=
0
;
hi
<
h_count
;
++
hi
)
{
for
(
int
wi
=
0
;
wi
<
w_count
;
++
wi
)
{
for
(
int
ci
=
0
;
ci
<
c_count
;
++
ci
)
{
const
int
sum_idx
=
(
hi
*
w_count
+
wi
)
*
c_count
+
ci
;
sum
[
sum_idx
]
=
bias_ptr
[
c_offset
+
ci
];
}
}
}
}
for
(
int
kh
=
0
;
kh
<
kernel_h
;
++
kh
)
{
for
(
int
kw
=
0
;
kw
<
kernel_w
;
++
kw
)
{
int
inc
=
0
;
for
(;
inc
+
inc_tile_size
<=
input_channels
;
inc
+=
inc_tile_size
)
{
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
// AArch64 NEON has 32 128-bit general purpose registers
static_assert
(
inc_tile_size
==
4
,
"input channels tile size must be 4"
);
float32x4_t
in
[
h_count
*
w_count
];
#else
T
in
[
h_count
*
w_count
*
inc_tile_size
];
#endif
for
(
int
hi
=
0
;
hi
<
h_count
;
++
hi
)
{
for
(
int
wi
=
0
;
wi
<
w_count
;
++
wi
)
{
const
int
in_idx
=
hi
*
w_count
+
wi
;
const
int
inh
=
(
h_offset
+
hi
)
*
stride_h
+
kh
*
dilation_h
;
const
int
inw
=
(
w_offset
+
wi
)
*
stride_w
+
kw
*
dilation_w
;
const
int
in_offset
=
(
inh
*
padded_width
+
inw
)
*
input_channels
+
inc
;
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
static_assert
(
inc_tile_size
==
4
,
"input channels tile size must be 4"
);
in
[
in_idx
]
=
vld1q_f32
(
input_ptr
+
in_offset
);
#else
for
(
int
inci
=
0
;
inci
<
inc_tile_size
;
++
inci
)
{
in
[
in_idx
*
inc_tile_size
+
inci
]
=
input_ptr
[
in_offset
+
inci
];
}
#endif
}
}
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
static_assert
(
inc_tile_size
==
4
,
"input channels tile size must be 4"
);
float32x4_t
weights
[
c_count
];
#else
T
weights
[
c_count
*
inc_tile_size
];
#endif
for
(
int
ci
=
0
;
ci
<
c_count
;
++
ci
)
{
const
int
weights_idx
=
ci
;
const
int
filter_offset
=
((
kh
*
kernel_w
+
kw
)
*
channels
+
c_offset
+
ci
)
*
input_channels
+
inc
;
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
weights
[
weights_idx
]
=
vld1q_f32
(
filter_ptr
+
filter_offset
);
#else
for
(
int
inci
=
0
;
inci
<
inc_tile_size
;
++
inci
)
{
weights
[
weights_idx
*
inc_tile_size
+
inci
]
=
filter_ptr
[
filter_offset
+
inci
];
}
#endif
}
for
(
int
hi
=
0
;
hi
<
h_count
;
++
hi
)
{
for
(
int
wi
=
0
;
wi
<
w_count
;
++
wi
)
{
for
(
int
ci
=
0
;
ci
<
c_count
;
++
ci
)
{
const
int
weights_idx
=
ci
;
const
int
in_idx
=
hi
*
w_count
+
wi
;
const
int
sum_idx
=
(
hi
*
w_count
+
wi
)
*
c_count
+
ci
;
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
float32x4_t
tmp
=
vmulq_f32
(
in
[
in_idx
],
weights
[
weights_idx
]);
sum
[
sum_idx
]
+=
vaddvq_f32
(
tmp
);
#else
for
(
int
inci
=
0
;
inci
<
inc_tile_size
;
++
inci
)
{
sum
[
sum_idx
]
+=
in
[
in_idx
*
inc_tile_size
+
inci
]
*
weights
[
weights_idx
*
inc_tile_size
+
inci
];
}
#endif
}
}
}
}
// handling the remaining input channels
for
(;
inc
<
input_channels
;
++
inc
)
{
T
in
[
h_count
*
w_count
];
for
(
int
hi
=
0
;
hi
<
h_count
;
++
hi
)
{
for
(
int
wi
=
0
;
wi
<
w_count
;
++
wi
)
{
const
int
in_idx
=
hi
*
w_count
+
wi
;
const
int
inh
=
(
h_offset
+
hi
)
*
stride_h
+
kh
*
dilation_h
;
const
int
inw
=
(
w_offset
+
wi
)
*
stride_w
+
kw
*
dilation_w
;
const
int
in_offset
=
(
inh
*
padded_width
+
inw
)
*
input_channels
+
inc
;
in
[
in_idx
]
=
input_ptr
[
in_offset
];
}
}
T
weights
[
c_count
];
for
(
int
ci
=
0
;
ci
<
c_count
;
++
ci
)
{
const
int
weights_idx
=
ci
;
const
int
filter_offset
=
((
kh
*
kernel_w
+
kw
)
*
channels
+
c_offset
+
ci
)
*
input_channels
+
inc
;
weights
[
weights_idx
]
=
filter_ptr
[
filter_offset
];
}
for
(
int
hi
=
0
;
hi
<
h_count
;
++
hi
)
{
for
(
int
wi
=
0
;
wi
<
w_count
;
++
wi
)
{
for
(
int
ci
=
0
;
ci
<
c_count
;
++
ci
)
{
const
int
weights_idx
=
ci
;
const
int
in_idx
=
hi
*
w_count
+
wi
;
const
int
sum_idx
=
(
hi
*
w_count
+
wi
)
*
c_count
+
ci
;
sum
[
sum_idx
]
+=
in
[
in_idx
]
*
weights
[
weights_idx
];
}
}
}
}
}
}
// save output
for
(
int
hi
=
0
;
hi
<
h_count
;
++
hi
)
{
for
(
int
wi
=
0
;
wi
<
w_count
;
++
wi
)
{
for
(
int
ci
=
0
;
ci
<
c_count
;
++
ci
)
{
const
int
out_offset
=
((
h_offset
+
hi
)
*
width
+
w_offset
+
wi
)
*
channels
+
c_offset
+
ci
;
const
int
sum_idx
=
(
hi
*
w_count
+
wi
)
*
c_count
+
ci
;
output_ptr
[
out_offset
]
=
sum
[
sum_idx
];
}
}
}
}
};
// namespace
struct
Conv2dFunctorBase
{
Conv2dFunctorBase
(
const
int
*
strides
,
...
...
@@ -28,7 +190,7 @@ struct Conv2dFunctorBase {
relux_max_limit_
(
relux_max_limit
),
prelu_alpha_
(
prelu_alpha
)
{}
const
int
*
strides_
;
// [stride_h, stride_w]
const
int
*
strides_
;
// [stride_h, stride_w]
const
Padding
paddings_
;
const
int
*
dilations_
;
// [dilation_h, dilation_w]
const
ActivationType
activation_
;
...
...
@@ -51,8 +213,8 @@ struct Conv2dFunctor : Conv2dFunctorBase {
relux_max_limit
,
prelu_alpha
)
{}
void
operator
()(
const
Tensor
*
input
,
// NHWC
const
Tensor
*
filter
,
// HWIO
void
operator
()(
const
Tensor
*
input
,
// NHWC
const
Tensor
*
filter
,
// HWOI
const
Tensor
*
bias
,
Tensor
*
output
,
StatsFuture
*
future
)
{
...
...
@@ -67,18 +229,21 @@ struct Conv2dFunctor : Conv2dFunctorBase {
paddings_
,
output_shape
.
data
(),
paddings
.
data
());
output
->
Resize
(
output_shape
);
in
dex_
t
batch
=
output
->
dim
(
0
);
in
dex_
t
height
=
output
->
dim
(
1
);
in
dex_
t
width
=
output
->
dim
(
2
);
in
dex_
t
channels
=
output
->
dim
(
3
);
int
batch
=
output
->
dim
(
0
);
int
height
=
output
->
dim
(
1
);
int
width
=
output
->
dim
(
2
);
int
channels
=
output
->
dim
(
3
);
in
dex_
t
input_batch
=
input
->
dim
(
0
);
in
dex_
t
input_height
=
input
->
dim
(
1
);
in
dex_
t
input_width
=
input
->
dim
(
2
);
in
dex_
t
input_channels
=
input
->
dim
(
3
);
int
input_batch
=
input
->
dim
(
0
);
int
input_height
=
input
->
dim
(
1
);
int
input_width
=
input
->
dim
(
2
);
int
input_channels
=
input
->
dim
(
3
);
index_t
kernel_h
=
filter
->
dim
(
0
);
index_t
kernel_w
=
filter
->
dim
(
1
);
int
kernel_h
=
filter
->
dim
(
0
);
int
kernel_w
=
filter
->
dim
(
1
);
MACE_CHECK
(
filter
->
dim
(
2
)
==
channels
,
filter
->
dim
(
2
),
" != "
,
channels
);
MACE_CHECK
(
filter
->
dim
(
3
)
==
input_channels
,
filter
->
dim
(
3
),
" != "
,
input_channels
);
int
stride_h
=
strides_
[
0
];
int
stride_w
=
strides_
[
1
];
...
...
@@ -88,11 +253,17 @@ struct Conv2dFunctor : Conv2dFunctorBase {
MACE_CHECK
(
batch
==
input_batch
,
"Input/Output batch size mismatch"
);
// The left-upper most offset of the padded input
int
padded_h_start
=
0
-
paddings
[
0
]
/
2
;
int
padded_w_start
=
0
-
paddings
[
1
]
/
2
;
index_t
padded_h_stop
=
input_height
+
paddings
[
0
]
-
paddings
[
0
]
/
2
;
index_t
padded_w_stop
=
input_width
+
paddings
[
1
]
-
paddings
[
1
]
/
2
;
int
padded_height
=
input_height
+
paddings
[
0
];
int
padded_width
=
input_width
+
paddings
[
1
];
Tensor
padded_input
;
// Keep this alive during kernel execution
if
(
paddings
[
0
]
>
0
||
paddings
[
1
]
>
0
)
{
ConstructNHWCInputWithPadding
(
input
,
paddings
.
data
(),
&
padded_input
);
input
=
&
padded_input
;
}
// padded_input.DebugPrint();
Tensor
::
MappingGuard
input_mapper
(
input
);
Tensor
::
MappingGuard
filter_mapper
(
filter
);
...
...
@@ -103,40 +274,338 @@ struct Conv2dFunctor : Conv2dFunctorBase {
auto
bias_data
=
bias
==
nullptr
?
nullptr
:
bias
->
data
<
T
>
();
auto
output_data
=
output
->
mutable_data
<
T
>
();
constexpr
int
inc_tile_size
=
4
;
// TODO Auto tuning these parameters
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
const
int
c_tile_size
=
4
;
const
int
h_tile_size
=
2
;
const
int
w_tile_size
=
2
;
#else
const
int
c_tile_size
=
4
;
const
int
h_tile_size
=
1
;
const
int
w_tile_size
=
2
;
#endif
const
int
c_tiles
=
RoundUpDiv
(
channels
,
c_tile_size
);
const
int
h_tiles
=
RoundUpDiv
(
height
,
h_tile_size
);
const
int
w_tiles
=
RoundUpDiv
(
width
,
w_tile_size
);
#pragma omp parallel for collapse(4)
for
(
int
n
=
0
;
n
<
batch
;
++
n
)
{
for
(
int
h
=
0
;
h
<
height
;
++
h
)
{
for
(
int
w
=
0
;
w
<
width
;
++
w
)
{
for
(
int
c
=
0
;
c
<
channels
;
++
c
)
{
const
int
out_idx
=
((
n
*
height
+
h
)
*
width
+
w
)
*
channels
+
c
;
T
bias_channel
=
0.0
f
;
if
(
bias
)
bias_channel
=
bias_data
[
c
];
output_data
[
out_idx
]
=
bias_channel
;
T
sum
=
0.0
f
;
const
T
*
filter_ptr
=
filter_data
+
c
;
for
(
int
kh
=
0
;
kh
<
kernel_h
;
++
kh
)
{
for
(
int
kw
=
0
;
kw
<
kernel_w
;
++
kw
)
{
for
(
int
inc
=
0
;
inc
<
input_channels
;
++
inc
)
{
int
inh
=
padded_h_start
+
h
*
stride_h
+
dilation_h
*
kh
;
int
inw
=
padded_w_start
+
w
*
stride_w
+
dilation_w
*
kw
;
if
(
inh
<
0
||
inh
>=
input_height
||
inw
<
0
||
inw
>=
input_width
)
{
MACE_CHECK
(
inh
>=
padded_h_start
&&
inh
<
padded_h_stop
&&
inw
>=
padded_w_start
&&
inw
<
padded_w_stop
,
"Out of range read from input: "
,
inh
,
", "
,
inw
);
}
else
{
index_t
input_offset
=
n
*
input_height
*
input_width
*
input_channels
+
inh
*
input_width
*
input_channels
+
inw
*
input_channels
+
inc
;
sum
+=
input_data
[
input_offset
]
*
*
filter_ptr
;
}
filter_ptr
+=
channels
;
for
(
int
cb
=
0
;
cb
<
c_tiles
;
++
cb
)
{
for
(
int
hb
=
0
;
hb
<
h_tiles
;
++
hb
)
{
for
(
int
wb
=
0
;
wb
<
w_tiles
;
++
wb
)
{
const
T
*
input_ptr
=
input_data
+
n
*
padded_height
*
padded_width
*
input_channels
;
T
*
output_ptr
=
output_data
+
n
*
height
*
width
*
channels
;
const
int
h_offset
=
hb
*
h_tile_size
;
const
int
w_offset
=
wb
*
w_tile_size
;
const
int
c_offset
=
cb
*
c_tile_size
;
const
int
h_count
=
std
::
min
(
h_tile_size
,
height
-
h_offset
);
const
int
w_count
=
std
::
min
(
w_tile_size
,
width
-
w_offset
);
const
int
c_count
=
std
::
min
(
c_tile_size
,
channels
-
c_offset
);
switch
(
c_count
)
{
case
1
:
switch
(
h_count
)
{
case
1
:
switch
(
w_count
)
{
case
1
:
Conv2dKernelFunc
<
T
,
inc_tile_size
,
1
,
1
,
1
>
(
input_ptr
,
filter_data
,
bias_data
,
output_ptr
,
h_offset
,
w_offset
,
c_offset
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
dilation_h
,
dilation_w
,
channels
,
input_channels
,
width
,
padded_width
);
break
;
case
2
:
Conv2dKernelFunc
<
T
,
inc_tile_size
,
1
,
1
,
2
>
(
input_ptr
,
filter_data
,
bias_data
,
output_ptr
,
h_offset
,
w_offset
,
c_offset
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
dilation_h
,
dilation_w
,
channels
,
input_channels
,
width
,
padded_width
);
break
;
case
3
:
Conv2dKernelFunc
<
T
,
inc_tile_size
,
1
,
1
,
3
>
(
input_ptr
,
filter_data
,
bias_data
,
output_ptr
,
h_offset
,
w_offset
,
c_offset
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
dilation_h
,
dilation_w
,
channels
,
input_channels
,
width
,
padded_width
);
break
;
case
4
:
Conv2dKernelFunc
<
T
,
inc_tile_size
,
1
,
1
,
4
>
(
input_ptr
,
filter_data
,
bias_data
,
output_ptr
,
h_offset
,
w_offset
,
c_offset
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
dilation_h
,
dilation_w
,
channels
,
input_channels
,
width
,
padded_width
);
break
;
default:
LOG
(
FATAL
)
<<
"Unsupported width tile: "
<<
w_count
;
}
break
;
case
2
:
switch
(
w_count
)
{
case
1
:
Conv2dKernelFunc
<
T
,
inc_tile_size
,
1
,
2
,
1
>
(
input_ptr
,
filter_data
,
bias_data
,
output_ptr
,
h_offset
,
w_offset
,
c_offset
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
dilation_h
,
dilation_w
,
channels
,
input_channels
,
width
,
padded_width
);
break
;
case
2
:
Conv2dKernelFunc
<
T
,
inc_tile_size
,
1
,
2
,
2
>
(
input_ptr
,
filter_data
,
bias_data
,
output_ptr
,
h_offset
,
w_offset
,
c_offset
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
dilation_h
,
dilation_w
,
channels
,
input_channels
,
width
,
padded_width
);
break
;
case
3
:
Conv2dKernelFunc
<
T
,
inc_tile_size
,
1
,
2
,
3
>
(
input_ptr
,
filter_data
,
bias_data
,
output_ptr
,
h_offset
,
w_offset
,
c_offset
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
dilation_h
,
dilation_w
,
channels
,
input_channels
,
width
,
padded_width
);
break
;
case
4
:
Conv2dKernelFunc
<
T
,
inc_tile_size
,
1
,
2
,
4
>
(
input_ptr
,
filter_data
,
bias_data
,
output_ptr
,
h_offset
,
w_offset
,
c_offset
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
dilation_h
,
dilation_w
,
channels
,
input_channels
,
width
,
padded_width
);
break
;
default:
LOG
(
FATAL
)
<<
"Unsupported width tile: "
<<
w_count
;
}
break
;
default:
LOG
(
FATAL
)
<<
"Unsupported height tile: "
<<
h_count
;
}
}
break
;
case
2
:
switch
(
h_count
)
{
case
1
:
switch
(
w_count
)
{
case
1
:
Conv2dKernelFunc
<
T
,
inc_tile_size
,
2
,
1
,
1
>
(
input_ptr
,
filter_data
,
bias_data
,
output_ptr
,
h_offset
,
w_offset
,
c_offset
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
dilation_h
,
dilation_w
,
channels
,
input_channels
,
width
,
padded_width
);
break
;
case
2
:
Conv2dKernelFunc
<
T
,
inc_tile_size
,
2
,
1
,
2
>
(
input_ptr
,
filter_data
,
bias_data
,
output_ptr
,
h_offset
,
w_offset
,
c_offset
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
dilation_h
,
dilation_w
,
channels
,
input_channels
,
width
,
padded_width
);
break
;
case
3
:
Conv2dKernelFunc
<
T
,
inc_tile_size
,
2
,
1
,
3
>
(
input_ptr
,
filter_data
,
bias_data
,
output_ptr
,
h_offset
,
w_offset
,
c_offset
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
dilation_h
,
dilation_w
,
channels
,
input_channels
,
width
,
padded_width
);
break
;
case
4
:
Conv2dKernelFunc
<
T
,
inc_tile_size
,
2
,
1
,
4
>
(
input_ptr
,
filter_data
,
bias_data
,
output_ptr
,
h_offset
,
w_offset
,
c_offset
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
dilation_h
,
dilation_w
,
channels
,
input_channels
,
width
,
padded_width
);
break
;
default:
LOG
(
FATAL
)
<<
"Unsupported width tile: "
<<
w_count
;
}
break
;
case
2
:
switch
(
w_count
)
{
case
1
:
Conv2dKernelFunc
<
T
,
inc_tile_size
,
2
,
2
,
1
>
(
input_ptr
,
filter_data
,
bias_data
,
output_ptr
,
h_offset
,
w_offset
,
c_offset
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
dilation_h
,
dilation_w
,
channels
,
input_channels
,
width
,
padded_width
);
break
;
case
2
:
Conv2dKernelFunc
<
T
,
inc_tile_size
,
2
,
2
,
2
>
(
input_ptr
,
filter_data
,
bias_data
,
output_ptr
,
h_offset
,
w_offset
,
c_offset
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
dilation_h
,
dilation_w
,
channels
,
input_channels
,
width
,
padded_width
);
break
;
case
3
:
Conv2dKernelFunc
<
T
,
inc_tile_size
,
2
,
2
,
3
>
(
input_ptr
,
filter_data
,
bias_data
,
output_ptr
,
h_offset
,
w_offset
,
c_offset
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
dilation_h
,
dilation_w
,
channels
,
input_channels
,
width
,
padded_width
);
break
;
case
4
:
Conv2dKernelFunc
<
T
,
inc_tile_size
,
2
,
2
,
4
>
(
input_ptr
,
filter_data
,
bias_data
,
output_ptr
,
h_offset
,
w_offset
,
c_offset
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
dilation_h
,
dilation_w
,
channels
,
input_channels
,
width
,
padded_width
);
break
;
default:
LOG
(
FATAL
)
<<
"Unsupported width tile: "
<<
w_count
;
}
break
;
default:
LOG
(
FATAL
)
<<
"Unsupported height tile: "
<<
h_count
;
}
break
;
case
3
:
switch
(
h_count
)
{
case
1
:
switch
(
w_count
)
{
case
1
:
Conv2dKernelFunc
<
T
,
inc_tile_size
,
3
,
1
,
1
>
(
input_ptr
,
filter_data
,
bias_data
,
output_ptr
,
h_offset
,
w_offset
,
c_offset
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
dilation_h
,
dilation_w
,
channels
,
input_channels
,
width
,
padded_width
);
break
;
case
2
:
Conv2dKernelFunc
<
T
,
inc_tile_size
,
3
,
1
,
2
>
(
input_ptr
,
filter_data
,
bias_data
,
output_ptr
,
h_offset
,
w_offset
,
c_offset
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
dilation_h
,
dilation_w
,
channels
,
input_channels
,
width
,
padded_width
);
break
;
case
3
:
Conv2dKernelFunc
<
T
,
inc_tile_size
,
3
,
1
,
3
>
(
input_ptr
,
filter_data
,
bias_data
,
output_ptr
,
h_offset
,
w_offset
,
c_offset
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
dilation_h
,
dilation_w
,
channels
,
input_channels
,
width
,
padded_width
);
break
;
case
4
:
Conv2dKernelFunc
<
T
,
inc_tile_size
,
3
,
1
,
4
>
(
input_ptr
,
filter_data
,
bias_data
,
output_ptr
,
h_offset
,
w_offset
,
c_offset
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
dilation_h
,
dilation_w
,
channels
,
input_channels
,
width
,
padded_width
);
break
;
default:
LOG
(
FATAL
)
<<
"Unsupported width tile: "
<<
w_count
;
}
break
;
case
2
:
switch
(
w_count
)
{
case
1
:
Conv2dKernelFunc
<
T
,
inc_tile_size
,
3
,
2
,
1
>
(
input_ptr
,
filter_data
,
bias_data
,
output_ptr
,
h_offset
,
w_offset
,
c_offset
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
dilation_h
,
dilation_w
,
channels
,
input_channels
,
width
,
padded_width
);
break
;
case
2
:
Conv2dKernelFunc
<
T
,
inc_tile_size
,
3
,
2
,
2
>
(
input_ptr
,
filter_data
,
bias_data
,
output_ptr
,
h_offset
,
w_offset
,
c_offset
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
dilation_h
,
dilation_w
,
channels
,
input_channels
,
width
,
padded_width
);
break
;
case
3
:
Conv2dKernelFunc
<
T
,
inc_tile_size
,
3
,
2
,
3
>
(
input_ptr
,
filter_data
,
bias_data
,
output_ptr
,
h_offset
,
w_offset
,
c_offset
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
dilation_h
,
dilation_w
,
channels
,
input_channels
,
width
,
padded_width
);
break
;
case
4
:
Conv2dKernelFunc
<
T
,
inc_tile_size
,
3
,
2
,
4
>
(
input_ptr
,
filter_data
,
bias_data
,
output_ptr
,
h_offset
,
w_offset
,
c_offset
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
dilation_h
,
dilation_w
,
channels
,
input_channels
,
width
,
padded_width
);
break
;
default:
LOG
(
FATAL
)
<<
"Unsupported width tile: "
<<
w_count
;
}
break
;
default:
LOG
(
FATAL
)
<<
"Unsupported height tile: "
<<
h_count
;
}
break
;
case
4
:
switch
(
h_count
)
{
case
1
:
switch
(
w_count
)
{
case
1
:
Conv2dKernelFunc
<
T
,
inc_tile_size
,
4
,
1
,
1
>
(
input_ptr
,
filter_data
,
bias_data
,
output_ptr
,
h_offset
,
w_offset
,
c_offset
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
dilation_h
,
dilation_w
,
channels
,
input_channels
,
width
,
padded_width
);
break
;
case
2
:
Conv2dKernelFunc
<
T
,
inc_tile_size
,
4
,
1
,
2
>
(
input_ptr
,
filter_data
,
bias_data
,
output_ptr
,
h_offset
,
w_offset
,
c_offset
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
dilation_h
,
dilation_w
,
channels
,
input_channels
,
width
,
padded_width
);
break
;
case
3
:
Conv2dKernelFunc
<
T
,
inc_tile_size
,
4
,
1
,
3
>
(
input_ptr
,
filter_data
,
bias_data
,
output_ptr
,
h_offset
,
w_offset
,
c_offset
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
dilation_h
,
dilation_w
,
channels
,
input_channels
,
width
,
padded_width
);
break
;
case
4
:
Conv2dKernelFunc
<
T
,
inc_tile_size
,
4
,
1
,
4
>
(
input_ptr
,
filter_data
,
bias_data
,
output_ptr
,
h_offset
,
w_offset
,
c_offset
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
dilation_h
,
dilation_w
,
channels
,
input_channels
,
width
,
padded_width
);
break
;
default:
LOG
(
FATAL
)
<<
"Unsupported width tile: "
<<
w_count
;
}
break
;
case
2
:
switch
(
w_count
)
{
case
1
:
Conv2dKernelFunc
<
T
,
inc_tile_size
,
4
,
2
,
1
>
(
input_ptr
,
filter_data
,
bias_data
,
output_ptr
,
h_offset
,
w_offset
,
c_offset
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
dilation_h
,
dilation_w
,
channels
,
input_channels
,
width
,
padded_width
);
break
;
case
2
:
Conv2dKernelFunc
<
T
,
inc_tile_size
,
4
,
2
,
2
>
(
input_ptr
,
filter_data
,
bias_data
,
output_ptr
,
h_offset
,
w_offset
,
c_offset
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
dilation_h
,
dilation_w
,
channels
,
input_channels
,
width
,
padded_width
);
break
;
case
3
:
Conv2dKernelFunc
<
T
,
inc_tile_size
,
4
,
2
,
3
>
(
input_ptr
,
filter_data
,
bias_data
,
output_ptr
,
h_offset
,
w_offset
,
c_offset
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
dilation_h
,
dilation_w
,
channels
,
input_channels
,
width
,
padded_width
);
break
;
case
4
:
Conv2dKernelFunc
<
T
,
inc_tile_size
,
4
,
2
,
4
>
(
input_ptr
,
filter_data
,
bias_data
,
output_ptr
,
h_offset
,
w_offset
,
c_offset
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
dilation_h
,
dilation_w
,
channels
,
input_channels
,
width
,
padded_width
);
break
;
default:
LOG
(
FATAL
)
<<
"Unsupported width tile: "
<<
w_count
;
}
break
;
default:
LOG
(
FATAL
)
<<
"Unsupported height tile: "
<<
h_count
;
}
break
;
default:
LOG
(
FATAL
)
<<
"Unsupported channel tile: "
<<
c_count
;
}
output_data
[
out_idx
]
+=
sum
;
}
}
}
...
...
mace/kernels/conv_pool_2d_util.cc
浏览文件 @
780f5a60
...
...
@@ -17,7 +17,7 @@ void CalcPaddingAndOutputSize(const index_t *input_shape, // NCHW
MACE_CHECK
(
dilations
[
0
]
>
0
&&
dilations
[
1
]
>
0
,
"Invalid dilations, must >= 1"
);
MACE_CHECK
((
dilations
[
0
]
==
1
||
strides
[
0
]
==
1
)
&&
(
dilations
[
1
]
==
1
||
strides
[
1
]
==
1
),
(
dilations
[
1
]
==
1
||
strides
[
1
]
==
1
),
"If dilations > 1, strides should be 1"
);
MACE_CHECK_NOTNULL
(
output_shape
);
MACE_CHECK_NOTNULL
(
padding_size
);
...
...
@@ -51,7 +51,8 @@ void CalcPaddingAndOutputSize(const index_t *input_shape, // NCHW
output_height
=
(
input_shape
[
2
]
+
k_extent_height
-
2
)
/
strides
[
0
]
+
1
;
output_width
=
(
input_shape
[
3
]
+
k_extent_width
-
2
)
/
strides
[
1
]
+
1
;
break
;
default:
MACE_CHECK
(
false
,
"Unsupported padding type: "
,
padding
);
default:
MACE_CHECK
(
false
,
"Unsupported padding type: "
,
padding
);
}
// Note: TensorFlow may padded one more on the right/bottom side
...
...
@@ -59,12 +60,10 @@ void CalcPaddingAndOutputSize(const index_t *input_shape, // NCHW
// utilize the more centered features. We need to benchmark
// based on the model accuracy.
padding_size
[
0
]
=
std
::
max
<
int
>
(
0
,
(
output_height
-
1
)
*
strides
[
0
]
+
k_extent_height
-
input_shape
[
2
]);
padding_size
[
1
]
=
std
::
max
<
int
>
(
0
,
(
output_width
-
1
)
*
strides
[
1
]
+
k_extent_width
-
input_shape
[
3
]);
padding_size
[
0
]
=
std
::
max
<
int
>
(
0
,
(
output_height
-
1
)
*
strides
[
0
]
+
k_extent_height
-
input_shape
[
2
]);
padding_size
[
1
]
=
std
::
max
<
int
>
(
0
,
(
output_width
-
1
)
*
strides
[
1
]
+
k_extent_width
-
input_shape
[
3
]);
output_shape
[
0
]
=
input_shape
[
0
];
output_shape
[
1
]
=
output_channels
;
...
...
@@ -73,7 +72,7 @@ void CalcPaddingAndOutputSize(const index_t *input_shape, // NCHW
}
void
CalcNHWCPaddingAndOutputSize
(
const
index_t
*
input_shape
,
// NHWC
const
index_t
*
filter_shape
,
// HW
IO
const
index_t
*
filter_shape
,
// HW
OI
const
int
*
dilations
,
const
int
*
strides
,
Padding
padding
,
...
...
@@ -82,7 +81,7 @@ void CalcNHWCPaddingAndOutputSize(const index_t *input_shape, // NHWC
MACE_CHECK
(
dilations
[
0
]
>
0
&&
dilations
[
1
]
>
0
,
"Invalid dilations, must >= 1"
);
MACE_CHECK
((
dilations
[
0
]
==
1
||
strides
[
0
]
==
1
)
&&
(
dilations
[
1
]
==
1
||
strides
[
1
]
==
1
),
(
dilations
[
1
]
==
1
||
strides
[
1
]
==
1
),
"If dilations > 1, strides should be 1"
);
MACE_CHECK_NOTNULL
(
output_shape
);
MACE_CHECK_NOTNULL
(
padding_size
);
...
...
@@ -98,7 +97,7 @@ void CalcNHWCPaddingAndOutputSize(const index_t *input_shape, // NHWC
index_t
output_height
=
0
,
output_width
=
0
;
index_t
kernel_height
=
filter_shape
[
0
];
index_t
kernel_width
=
filter_shape
[
1
];
index_t
output_channels
=
filter_shape
[
3
];
index_t
output_channels
=
filter_shape
[
2
];
index_t
k_extent_height
=
(
kernel_height
-
1
)
*
dilations
[
0
]
+
1
;
index_t
k_extent_width
=
(
kernel_width
-
1
)
*
dilations
[
1
]
+
1
;
...
...
@@ -116,7 +115,8 @@ void CalcNHWCPaddingAndOutputSize(const index_t *input_shape, // NHWC
output_height
=
(
input_shape
[
1
]
+
k_extent_height
-
2
)
/
strides
[
0
]
+
1
;
output_width
=
(
input_shape
[
2
]
+
k_extent_width
-
2
)
/
strides
[
1
]
+
1
;
break
;
default:
MACE_CHECK
(
false
,
"Unsupported padding type: "
,
padding
);
default:
MACE_CHECK
(
false
,
"Unsupported padding type: "
,
padding
);
}
// Note: TensorFlow may padded one more on the right/bottom side
...
...
@@ -124,12 +124,10 @@ void CalcNHWCPaddingAndOutputSize(const index_t *input_shape, // NHWC
// utilize the more centered features. We need to benchmark
// based on the model accuracy.
padding_size
[
0
]
=
std
::
max
<
int
>
(
0
,
(
output_height
-
1
)
*
strides
[
0
]
+
k_extent_height
-
input_shape
[
1
]);
padding_size
[
1
]
=
std
::
max
<
int
>
(
0
,
(
output_width
-
1
)
*
strides
[
1
]
+
k_extent_width
-
input_shape
[
2
]);
padding_size
[
0
]
=
std
::
max
<
int
>
(
0
,
(
output_height
-
1
)
*
strides
[
0
]
+
k_extent_height
-
input_shape
[
1
]);
padding_size
[
1
]
=
std
::
max
<
int
>
(
0
,
(
output_width
-
1
)
*
strides
[
1
]
+
k_extent_width
-
input_shape
[
2
]);
output_shape
[
0
]
=
input_shape
[
0
];
output_shape
[
1
]
=
output_height
;
...
...
@@ -146,7 +144,7 @@ void CalPaddingSize(const index_t *input_shape, // NCHW
MACE_CHECK
(
dilations
[
0
]
>
0
&&
dilations
[
1
]
>
0
,
"Invalid dilations, must >= 1"
);
MACE_CHECK
((
dilations
[
0
]
==
1
||
strides
[
0
]
==
1
)
&&
(
dilations
[
1
]
==
1
||
strides
[
1
]
==
1
),
(
dilations
[
1
]
==
1
||
strides
[
1
]
==
1
),
"If dilations > 1, strides should be 1"
);
MACE_CHECK_NOTNULL
(
padding_size
);
...
...
@@ -167,19 +165,18 @@ void CalPaddingSize(const index_t *input_shape, // NCHW
output_height
=
(
input_shape
[
2
]
+
k_extent_height
-
2
)
/
strides
[
0
]
+
1
;
output_width
=
(
input_shape
[
3
]
+
k_extent_width
-
2
)
/
strides
[
1
]
+
1
;
break
;
default:
MACE_CHECK
(
false
,
"Unsupported padding type: "
,
padding
);
default:
MACE_CHECK
(
false
,
"Unsupported padding type: "
,
padding
);
}
// Note: TensorFlow may pad one more on the right/bottom side.
// TODO: maybe it is better to also truncate the left/top to
// utilize the more centered features. We need to benchmark
// based on the model accuracy.
padding_size
[
0
]
=
std
::
max
<
int
>
(
0
,
(
output_height
-
1
)
*
strides
[
0
]
+
k_extent_height
-
input_shape
[
2
]);
padding_size
[
1
]
=
std
::
max
<
int
>
(
0
,
(
output_width
-
1
)
*
strides
[
1
]
+
k_extent_width
-
input_shape
[
3
]);
padding_size
[
0
]
=
std
::
max
<
int
>
(
0
,
(
output_height
-
1
)
*
strides
[
0
]
+
k_extent_height
-
input_shape
[
2
]);
padding_size
[
1
]
=
std
::
max
<
int
>
(
0
,
(
output_width
-
1
)
*
strides
[
1
]
+
k_extent_width
-
input_shape
[
3
]);
}
void
ConstructInputWithPadding
(
const
Tensor
*
input_tensor
,
...
...
@@ -206,18 +203,18 @@ void ConstructInputWithPadding(const Tensor *input_tensor,
output_tensor
->
Resize
(
output_shape
);
Tensor
::
MappingGuard
padded_output_mapper
(
output_tensor
);
float
*
output_
ptr
=
output_tensor
->
mutable_data
<
float
>
();
memset
(
output_
ptr
,
0
,
output_tensor
->
size
()
*
sizeof
(
float
));
float
*
output_
data
=
output_tensor
->
mutable_data
<
float
>
();
memset
(
output_
data
,
0
,
output_tensor
->
size
()
*
sizeof
(
float
));
// Skip the padded top rows
if
(
padding_same_value
)
{
#define COPY_INPUT \
std::fill(output_ptr, output_ptr+padded_left, input[0]);
\
output_ptr += padded_left;
\
memcpy(output_ptr, input, width * sizeof(float));
\
output_ptr += width;
\
std::fill(output_ptr , output_ptr + padded_right, input[width-
1]); \
output_ptr
+= padded_right;
#define COPY_INPUT
\
std::fill(output_data, output_data + padded_left, input[0]);
\
output_data += padded_left;
\
memcpy(output_data, input, width * sizeof(float));
\
output_data += width;
\
std::fill(output_data, output_data + padded_right, input[width -
1]); \
output_data
+= padded_right;
const
int
padded_bottom
=
paddings
[
0
]
-
padded_top
;
const
int
padded_right
=
paddings
[
1
]
-
padded_left
;
...
...
@@ -239,19 +236,69 @@ void ConstructInputWithPadding(const Tensor *input_tensor,
}
#undef COPY_INPUT
}
else
{
output_
ptr
+=
padded_top
*
output_width
;
output_
data
+=
padded_top
*
output_width
;
for
(
int
i
=
0
;
i
<
batch
;
++
i
)
{
for
(
int
j
=
0
;
j
<
channels
;
++
j
)
{
for
(
int
k
=
0
;
k
<
height
;
++
k
)
{
memcpy
(
output_
ptr
+
padded_left
,
input
,
width
*
sizeof
(
float
));
memcpy
(
output_
data
+
padded_left
,
input
,
width
*
sizeof
(
float
));
input
+=
width
;
output_
ptr
+=
output_width
;
output_
data
+=
output_width
;
}
// Skip the padded bottom in this channel and top in the next channel
output_
ptr
+=
paddings
[
0
]
*
output_width
;
output_
data
+=
paddings
[
0
]
*
output_width
;
}
}
}
}
}
// namespace kernels
}
// namespace mace
void
ConstructNHWCInputWithPadding
(
const
Tensor
*
input_tensor
,
const
int
*
paddings
,
Tensor
*
output_tensor
,
bool
padding_same_value
)
{
VLOG
(
1
)
<<
"input: "
<<
input_tensor
->
NumElements
();
Tensor
::
MappingGuard
input_mapper
(
input_tensor
);
const
float
*
input
=
input_tensor
->
data
<
float
>
();
const
index_t
*
input_shape
=
input_tensor
->
shape
().
data
();
index_t
batch
=
input_shape
[
0
];
index_t
height
=
input_shape
[
1
];
index_t
width
=
input_shape
[
2
];
index_t
channels
=
input_shape
[
3
];
std
::
vector
<
index_t
>
output_shape
(
{
batch
,
paddings
[
0
]
+
height
,
paddings
[
1
]
+
width
,
channels
});
const
int
output_height
=
output_shape
[
1
];
const
int
output_width
=
output_shape
[
2
];
const
int
padded_top
=
paddings
[
0
]
/
2
;
const
int
padded_left
=
paddings
[
1
]
/
2
;
output_tensor
->
Resize
(
output_shape
);
Tensor
::
MappingGuard
padded_output_mapper
(
output_tensor
);
float
*
output_data
=
output_tensor
->
mutable_data
<
float
>
();
memset
(
output_data
,
0
,
output_tensor
->
size
()
*
sizeof
(
float
));
// Skip the padded top rows
if
(
padding_same_value
)
{
LOG
(
FATAL
)
<<
"Not implemented"
;
}
else
{
#pragma omp parallel for collapse(3)
for
(
int
n
=
0
;
n
<
batch
;
++
n
)
{
for
(
int
h
=
0
;
h
<
height
;
++
h
)
{
for
(
int
w
=
0
;
w
<
width
;
++
w
)
{
const
float
*
input_ptr
=
input
+
((
n
*
height
+
h
)
*
width
+
w
)
*
channels
;
float
*
output_ptr
=
output_data
+
((
n
*
output_height
+
h
+
padded_top
)
*
output_width
+
w
+
padded_left
)
*
channels
;
memcpy
(
output_ptr
,
input_ptr
,
channels
*
sizeof
(
float
));
}
}
}
}
}
}
// namespace kernels
}
// namespace mace
mace/kernels/conv_pool_2d_util.h
浏览文件 @
780f5a60
...
...
@@ -44,6 +44,12 @@ void ConstructInputWithPadding(const Tensor *input,
const
int
*
paddings
,
Tensor
*
output_tensor
,
bool
padding_same_value
=
false
);
void
ConstructNHWCInputWithPadding
(
const
Tensor
*
input
,
const
int
*
paddings
,
Tensor
*
output_tensor
,
bool
padding_same_value
=
false
);
}
// namespace kernels
}
// namespace mace
...
...
mace/kernels/depthwise_conv2d.h
浏览文件 @
780f5a60
...
...
@@ -64,8 +64,8 @@ struct DepthwiseConv2dFunctor : public DepthwiseConv2dFunctorBase {
std
::
vector
<
index_t
>
fake_filter_shape
(
4
);
fake_filter_shape
[
0
]
=
filter
->
shape
()[
0
];
fake_filter_shape
[
1
]
=
filter
->
shape
()[
1
];
fake_filter_shape
[
3
]
=
filter
->
shape
()[
2
]
*
filter
->
shape
()[
3
];
fake_filter_shape
[
2
]
=
1
;
fake_filter_shape
[
2
]
=
filter
->
shape
()[
2
]
*
filter
->
shape
()[
3
];
fake_filter_shape
[
3
]
=
1
;
std
::
vector
<
index_t
>
output_shape
(
4
);
std
::
vector
<
int
>
paddings
(
2
);
...
...
mace/kernels/neon/addn_neon.cc
浏览文件 @
780f5a60
...
...
@@ -10,9 +10,11 @@ namespace kernels {
template
<
>
void
AddNFunctor
<
DeviceType
::
NEON
,
float
>::
operator
()(
const
std
::
vector
<
const
Tensor
*>
&
input_tensors
,
Tensor
*
output_tensor
,
const
std
::
vector
<
const
Tensor
*>
&
input_tensors
,
Tensor
*
output_tensor
,
StatsFuture
*
future
)
{
// TODO: neon mem copy
output_tensor
->
ResizeLike
(
input_tensors
[
0
]);
index_t
size
=
output_tensor
->
size
();
float
*
output_ptr
=
output_tensor
->
mutable_data
<
float
>
();
memset
(
output_ptr
,
0
,
size
*
sizeof
(
float
));
...
...
mace/kernels/neon/avg_pooling_neon_3x3.cc
已删除
100644 → 0
浏览文件 @
07f8ff18
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include <arm_neon.h>
#include <float.h>
#include <limits>
#include "mace/core/common.h"
namespace
mace
{
namespace
kernels
{
void
PoolingAvgNeonK3x3S2x2
(
const
float
*
input
,
const
index_t
*
in_shape
,
float
*
output
,
const
index_t
*
out_shape
,
const
int
*
paddings
)
{
index_t
batch
=
in_shape
[
0
];
index_t
channels
=
in_shape
[
1
];
index_t
in_height
=
in_shape
[
2
];
index_t
in_width
=
in_shape
[
3
];
index_t
out_height
=
out_shape
[
2
];
index_t
out_width
=
out_shape
[
3
];
int
padding_top
=
paddings
[
0
]
/
2
;
int
padding_bottom
=
paddings
[
0
]
-
padding_top
;
int
padding_left
=
paddings
[
1
]
/
2
;
int
padding_right
=
paddings
[
1
]
-
padding_left
;
int
in_image_size
=
in_height
*
in_width
;
int
out_image_size
=
out_height
*
out_width
;
index_t
input_offset
=
0
;
index_t
output_offset
=
0
;
float
avg_factors
[
4
]
=
{
1.0
/
9.0
,
1.0
/
9.0
,
1.0
/
9.0
,
1.0
/
9.0
};
#pragma omp parallel for collapse(2)
for
(
int
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
int
c
=
0
;
c
<
channels
;
++
c
)
{
float
*
outptr
=
output
+
output_offset
;
for
(
int
h
=
0
;
h
<
out_height
;
++
h
)
{
int
w
=
0
;
int
num_vectors
=
0
;
const
float
*
r0
,
*
r1
,
*
r2
;
if
(
!
((
h
==
0
&&
padding_top
>
0
)
||
(
h
==
out_height
-
1
&&
padding_bottom
>
0
)))
{
r0
=
input
+
input_offset
+
(
h
*
2
-
padding_top
)
*
in_width
;
r1
=
r0
+
in_width
;
r2
=
r1
+
in_width
;
if
(
padding_left
>
0
)
{
if
(
padding_left
==
1
)
{
float
sum0
=
std
::
max
(
r0
[
0
],
r0
[
1
]);
float
sum1
=
std
::
max
(
r1
[
0
],
r1
[
1
]);
float
max2
=
std
::
max
(
r2
[
0
],
r2
[
1
]);
*
outptr
=
(
r0
[
0
]
+
r0
[
1
]
+
r1
[
0
]
+
r1
[
1
]
+
r2
[
0
]
+
r2
[
1
])
/
9.0
;
++
r0
;
++
r1
;
}
else
{
// padding_left == 2
*
outptr
=
(
r0
[
0
]
+
r1
[
0
]
+
r2
[
0
])
/
9.0
;
}
++
outptr
;
++
w
;
}
if
(
padding_right
>
0
)
{
num_vectors
=
(
out_width
-
w
-
1
)
>>
2
;
}
else
{
num_vectors
=
(
out_width
-
w
)
>>
2
;
}
}
w
+=
num_vectors
<<
2
;
float32x4_t
factors
=
vld1q_f32
(
avg_factors
);
float32x4x2_t
row0
=
vld2q_f32
(
r0
);
float32x4x2_t
row1
=
vld2q_f32
(
r1
);
float32x4x2_t
row2
=
vld2q_f32
(
r2
);
for
(;
num_vectors
>
0
;
--
num_vectors
)
{
float32x4x2_t
row0_next
=
vld2q_f32
(
r0
+
8
);
float32x4x2_t
row1_next
=
vld2q_f32
(
r1
+
8
);
float32x4x2_t
row2_next
=
vld2q_f32
(
r2
+
8
);
float32x4_t
sum0
=
vaddq_f32
(
row0
.
val
[
0
],
row0
.
val
[
1
]);
float32x4_t
sum1
=
vaddq_f32
(
row1
.
val
[
0
],
row1
.
val
[
1
]);
float32x4_t
sum2
=
vaddq_f32
(
row2
.
val
[
0
],
row2
.
val
[
1
]);
float32x4_t
row02
=
vextq_f32
(
row0
.
val
[
0
],
row0_next
.
val
[
0
],
1
);
float32x4_t
row12
=
vextq_f32
(
row1
.
val
[
0
],
row1_next
.
val
[
0
],
1
);
float32x4_t
row22
=
vextq_f32
(
row2
.
val
[
0
],
row2_next
.
val
[
0
],
1
);
sum0
=
vaddq_f32
(
sum0
,
row02
);
sum1
=
vaddq_f32
(
sum1
,
row12
);
sum2
=
vaddq_f32
(
sum2
,
row22
);
float32x4_t
sum_result
=
vaddq_f32
(
vaddq_f32
(
sum0
,
sum1
),
sum2
);
float32x4_t
avg_result
=
vmulq_f32
(
sum_result
,
factors
);
vst1q_f32
(
outptr
,
avg_result
);
row0
=
row0_next
;
row1
=
row1_next
;
row2
=
row2_next
;
r0
+=
8
;
r1
+=
8
;
r2
+=
8
;
outptr
+=
4
;
}
for
(;
w
<
out_width
;
++
w
)
{
float
sum
=
0.0
;
for
(
int
kh
=
0
;
kh
<
3
;
++
kh
)
{
for
(
int
kw
=
0
;
kw
<
3
;
++
kw
)
{
int
inh
=
h
*
2
-
padding_top
+
kh
;
int
inw
=
w
*
2
-
padding_left
+
kw
;
if
(
inh
>=
0
&&
inh
<
in_height
&&
inw
>=
0
&&
inw
<
in_width
)
{
sum
+=
input
[
input_offset
+
inh
*
in_width
+
inw
];
}
}
}
*
outptr
=
sum
/
9.0
;
++
outptr
;
}
}
input_offset
+=
in_image_size
;
output_offset
+=
out_image_size
;
}
}
}
// assume the input has already been padded
void
PoolingAvgNeonK3x3S2x2Padded
(
const
float
*
input
,
const
index_t
*
in_shape
,
float
*
output
,
const
index_t
*
out_shape
)
{
index_t
batch
=
in_shape
[
0
];
index_t
channels
=
in_shape
[
1
];
index_t
in_height
=
in_shape
[
2
];
index_t
in_width
=
in_shape
[
3
];
index_t
out_height
=
out_shape
[
2
];
index_t
out_width
=
out_shape
[
3
];
int
in_image_size
=
in_height
*
in_width
;
int
out_image_size
=
out_height
*
out_width
;
index_t
input_offset
=
0
;
index_t
output_offset
=
0
;
float
avg_factors
[
4
]
=
{
1.0
/
9.0
,
1.0
/
9.0
,
1.0
/
9.0
,
1.0
/
9.0
};
#pragma omp parallel for collapse(2)
for
(
int
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
int
c
=
0
;
c
<
channels
;
++
c
)
{
const
float
*
img0
=
input
+
input_offset
;
float
*
outptr
=
output
+
output_offset
;
const
float
*
r0
=
img0
;
const
float
*
r1
=
r0
+
in_width
;
const
float
*
r2
=
r1
+
in_width
;
for
(
int
h
=
0
;
h
<
out_height
;
h
++
)
{
int
num_vectors
=
out_width
>>
2
;
int
remain
=
out_width
-
(
num_vectors
<<
2
);
float32x4_t
factors
=
vld1q_f32
(
avg_factors
);
float32x4x2_t
row0
=
vld2q_f32
(
r0
);
float32x4x2_t
row1
=
vld2q_f32
(
r1
);
float32x4x2_t
row2
=
vld2q_f32
(
r2
);
for
(;
num_vectors
>
0
;
--
num_vectors
)
{
float32x4x2_t
row0_next
=
vld2q_f32
(
r0
+
8
);
float32x4x2_t
row1_next
=
vld2q_f32
(
r1
+
8
);
float32x4x2_t
row2_next
=
vld2q_f32
(
r2
+
8
);
float32x4_t
sum0
=
vaddq_f32
(
row0
.
val
[
0
],
row0
.
val
[
1
]);
float32x4_t
sum1
=
vaddq_f32
(
row1
.
val
[
0
],
row1
.
val
[
1
]);
float32x4_t
sum2
=
vaddq_f32
(
row2
.
val
[
0
],
row2
.
val
[
1
]);
float32x4_t
row02
=
vextq_f32
(
row0
.
val
[
0
],
row0_next
.
val
[
0
],
1
);
float32x4_t
row12
=
vextq_f32
(
row1
.
val
[
0
],
row1_next
.
val
[
0
],
1
);
float32x4_t
row22
=
vextq_f32
(
row2
.
val
[
0
],
row2_next
.
val
[
0
],
1
);
sum0
=
vaddq_f32
(
sum0
,
row02
);
sum1
=
vaddq_f32
(
sum1
,
row12
);
sum2
=
vaddq_f32
(
sum2
,
row22
);
float32x4_t
sum_result
=
vaddq_f32
(
vaddq_f32
(
sum0
,
sum1
),
sum2
);
float32x4_t
avg_result
=
vmulq_f32
(
sum_result
,
factors
);
vst1q_f32
(
outptr
,
avg_result
);
row0
=
row0_next
;
row1
=
row1_next
;
row2
=
row2_next
;
r0
+=
8
;
r1
+=
8
;
r2
+=
8
;
outptr
+=
4
;
}
for
(;
remain
>
0
;
remain
--
)
{
*
outptr
=
(
r0
[
0
]
+
r0
[
1
]
+
r0
[
2
]
+
r1
[
0
]
+
r1
[
1
]
+
r1
[
2
]
+
r2
[
0
]
+
r2
[
1
]
+
r2
[
2
])
/
9.0
;
r0
+=
2
;
r1
+=
2
;
r2
+=
2
;
outptr
++
;
}
r0
+=
1
+
in_width
;
r1
+=
1
+
in_width
;
r2
+=
1
+
in_width
;
}
input_offset
+=
in_image_size
;
output_offset
+=
out_image_size
;
}
}
}
}
// namespace kernels
}
// namespace mace
mace/kernels/neon/batch_norm_neon.cc
浏览文件 @
780f5a60
...
...
@@ -15,6 +15,7 @@ void BatchNormFunctor<DeviceType::NEON, float>::operator()(
const
Tensor
*
offset
,
const
Tensor
*
mean
,
const
Tensor
*
var
,
const
float
epsilon
,
Tensor
*
output
,
StatsFuture
*
future
)
{
// Batch normalization in the paper https://arxiv.org/abs/1502.03167 .
...
...
@@ -26,8 +27,8 @@ void BatchNormFunctor<DeviceType::NEON, float>::operator()(
// new_offset = \offset - mean * common_val;
// Y = new_scale * X + new_offset;
const
index_t
n
=
input
->
dim
(
0
);
const
index_t
channel
=
input
->
dim
(
1
);
const
index_t
sample_size
=
input
->
dim
(
2
)
*
input
->
dim
(
3
);
const
index_t
sample_size
=
input
->
dim
(
1
)
*
input
->
dim
(
2
);
const
index_t
channel
=
input
->
dim
(
3
);
const
float
*
input_ptr
=
input
->
data
<
float
>
();
const
float
*
scale_ptr
=
scale
->
data
<
float
>
();
...
...
@@ -36,36 +37,47 @@ void BatchNormFunctor<DeviceType::NEON, float>::operator()(
const
float
*
var_ptr
=
var
->
data
<
float
>
();
float
*
output_ptr
=
output
->
mutable_data
<
float
>
();
index_t
count
=
sample_size
>>
2
;
index_t
remain_count
=
sample_size
-
(
count
<<
2
);
const
index_t
ch_blks
=
channel
>>
2
;
const
index_t
remain_chs
=
channel
-
(
ch_blks
<<
2
);
std
::
vector
<
float
>
new_scale
(
channel
);
std
::
vector
<
float
>
new_offset
(
channel
);
#pragma omp parallel for
for
(
index_t
c
=
0
;
c
<
channel
;
++
c
)
{
float
new_scale
=
scale_ptr
[
c
]
/
std
::
sqrt
(
var_ptr
[
c
]
+
epsilon_
);
float
new_offset
=
offset_ptr
[
c
]
-
mean_ptr
[
c
]
*
new_scale
;
index_t
pos
=
c
*
sample_size
;
float32x4_t
new_scale_f
=
vdupq_n_f32
(
new_scale
);
float32x4_t
new_offset_f
=
vdupq_n_f32
(
new_offset
);
for
(
index_t
i
=
0
;
i
<
n
;
++
i
)
{
const
float
*
input_sample_ptr
=
input_ptr
+
pos
;
float
*
output_sample_ptr
=
output_ptr
+
pos
;
new_scale
[
c
]
=
scale_ptr
[
c
]
/
std
::
sqrt
(
var_ptr
[
c
]
+
epsilon
);
new_offset
[
c
]
=
offset_ptr
[
c
]
-
mean_ptr
[
c
]
*
new_scale
[
c
];
}
for
(
index_t
j
=
0
;
j
<
count
;
++
j
)
{
#pragma omp parallel for collapse(2)
for
(
index_t
i
=
0
;
i
<
n
;
++
i
)
{
for
(
index_t
j
=
0
;
j
<
sample_size
;
++
j
)
{
const
float
*
input_sample_ptr
=
input_ptr
+
(
i
*
sample_size
+
j
)
*
channel
;
float
*
output_sample_ptr
=
output_ptr
+
(
i
*
sample_size
+
j
)
*
channel
;
const
float
*
new_scale_ptr
=
new_scale
.
data
();
const
float
*
new_offset_ptr
=
new_offset
.
data
();
for
(
index_t
cb
=
0
;
cb
<
ch_blks
;
++
cb
)
{
float32x4_t
new_scale_f
=
vld1q_f32
(
new_scale_ptr
);
float32x4_t
new_offset_f
=
vld1q_f32
(
new_offset_ptr
);
float32x4_t
input_f
=
vld1q_f32
(
input_sample_ptr
);
float32x4_t
output_f
=
vfmaq_f32
(
new_offset_f
,
input_f
,
new_scale_f
);
vst1q_f32
(
output_sample_ptr
,
output_f
);
input_sample_ptr
+=
4
;
output_sample_ptr
+=
4
;
new_scale_ptr
+=
4
;
new_offset_ptr
+=
4
;
}
for
(
index_t
j
=
0
;
j
<
remain_count
;
++
j
)
{
*
output_sample_ptr
=
new_scale
*
*
input_sample_ptr
+
new_offset
;
for
(
index_t
c
=
(
ch_blks
<<
2
);
c
<
channel
;
++
c
)
{
*
output_sample_ptr
=
new_scale
[
c
]
*
*
input_sample_ptr
+
new_offset
[
c
]
;
++
output_sample_ptr
;
++
input_sample_ptr
;
++
new_scale_ptr
;
++
new_offset_ptr
;
}
pos
+=
channel
*
sample_size
;
}
}
};
}
// namespace kernels
}
//
namespace mace
}
// namespace mace
mace/kernels/neon/global_avg_pooling_neon.cc
已删除
100644 → 0
浏览文件 @
07f8ff18
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/kernels/global_avg_pooling.h"
#include <arm_neon.h>
namespace
mace
{
namespace
kernels
{
// Global average pooling for NEON: writes one mean per (batch, channel) plane.
//
//   input       - source data, indexed as batch x channels x height x width
//   input_shape - {batch, channels, height, width}
//   output      - receives batch * channels means, one per plane
//   future      - not used by this implementation
template <>
void GlobalAvgPoolingFunctor<DeviceType::NEON, float>::operator()(
    const float *input,
    const index_t *input_shape,
    float *output,
    StatsFuture *future) {
  const index_t batch = input_shape[0];
  const index_t channels = input_shape[1];
  const index_t height = input_shape[2];
  const index_t width = input_shape[3];

  const index_t image_size = height * width;
  const index_t total_channels = batch * channels;

#pragma omp parallel for
  for (index_t c = 0; c < total_channels; ++c) {
    const float *inptr = input + c * image_size;
    float sum = 0.0;

    // Four floats per vector; whatever is left is summed one by one.
    const index_t num_vectors = image_size >> 2;
    const index_t remain = image_size - (num_vectors << 2);

    if (num_vectors > 0) {
      float sum_out[4] = {0.0, 0.0, 0.0, 0.0};

      float32x4_t sum_vector = vld1q_f32(inptr);
      inptr += 4;
      for (index_t n = 1; n < num_vectors; ++n) {
        const float32x4_t vector = vld1q_f32(inptr);
        sum_vector = vaddq_f32(sum_vector, vector);
        inptr += 4;
      }
      // Horizontal add of the four partial sums.
      vst1q_f32(sum_out, sum_vector);
      sum = sum_out[0] + sum_out[1] + sum_out[2] + sum_out[3];
    }

    for (index_t i = 0; i < remain; ++i) {
      sum += *inptr;
      ++inptr;
    }
    output[c] = sum / image_size;
  }
}
}
// namespace kernels
}
// namespace mace
mace/kernels/neon/max_pooling_neon_2x2.cc
已删除
100644 → 0
浏览文件 @
07f8ff18
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include <arm_neon.h>
#include <float.h>
#include <limits>
#include "mace/core/common.h"
namespace
mace
{
namespace
kernels
{
void
PoolingMaxNeonK2x2S2x2
(
const
float
*
input
,
const
index_t
*
in_shape
,
float
*
output
,
const
index_t
*
out_shape
,
const
int
*
paddings
)
{
index_t
batch
=
in_shape
[
0
];
index_t
channels
=
in_shape
[
1
];
index_t
in_height
=
in_shape
[
2
];
index_t
in_width
=
in_shape
[
3
];
index_t
out_height
=
out_shape
[
2
];
index_t
out_width
=
out_shape
[
3
];
int
padding_top
=
paddings
[
0
]
/
2
;
int
padding_bottom
=
paddings
[
0
]
-
padding_top
;
int
padding_left
=
paddings
[
1
]
/
2
;
int
padding_right
=
paddings
[
1
]
-
padding_left
;
int
in_image_size
=
in_height
*
in_width
;
int
out_image_size
=
out_height
*
out_width
;
index_t
input_offset
=
0
;
index_t
output_offset
=
0
;
#pragma omp parallel for collapse(2)
for
(
int
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
int
c
=
0
;
c
<
channels
;
++
c
)
{
float
*
outptr
=
output
+
output_offset
;
const
float
*
r0
,
*
r1
;
for
(
int
h
=
0
;
h
<
out_height
;
++
h
)
{
int
w
=
0
;
int
num_vectors
=
0
;
if
(
!
((
h
==
0
&&
padding_top
>
0
)
||
(
h
==
out_height
-
1
&&
padding_bottom
>
0
)))
{
r0
=
input
+
input_offset
+
(
h
*
2
-
padding_top
)
*
in_width
;
r1
=
r0
+
in_width
;
if
(
padding_left
>
0
)
{
*
outptr
=
std
::
max
(
r0
[
0
],
r1
[
0
]);
++
r0
;
++
r1
;
++
outptr
;
++
w
;
}
if
(
padding_right
>
0
)
{
num_vectors
=
(
out_width
-
w
-
1
)
>>
2
;
}
else
{
num_vectors
=
(
out_width
-
w
)
>>
2
;
}
}
w
+=
num_vectors
<<
2
;
for
(;
num_vectors
>
0
;
--
num_vectors
)
{
float32x4_t
r00
=
vld1q_f32
(
r0
);
float32x4_t
r10
=
vld1q_f32
(
r1
);
float32x4_t
r01
=
vld1q_f32
(
r0
+
4
);
float32x4_t
r11
=
vld1q_f32
(
r1
+
4
);
float32x4_t
max0
=
vmaxq_f32
(
r00
,
r10
);
float32x4_t
max1
=
vmaxq_f32
(
r01
,
r11
);
float32x4_t
max_result
=
vpmaxq_f32
(
max0
,
max1
);
vst1q_f32
(
outptr
,
max_result
);
r0
+=
8
;
r1
+=
8
;
outptr
+=
4
;
}
for
(;
w
<
out_width
;
++
w
)
{
float
max
=
std
::
numeric_limits
<
float
>::
lowest
();
for
(
int
kh
=
0
;
kh
<
2
;
++
kh
)
{
for
(
int
kw
=
0
;
kw
<
2
;
++
kw
)
{
int
inh
=
h
*
2
-
padding_top
+
kh
;
int
inw
=
w
*
2
-
padding_left
+
kw
;
if
(
inh
>=
0
&&
inh
<
in_height
&&
inw
>=
0
&&
inw
<
in_width
)
{
max
=
std
::
max
(
max
,
input
[
input_offset
+
inh
*
in_width
+
inw
]);
}
}
}
*
outptr
=
max
;
++
outptr
;
}
}
input_offset
+=
in_image_size
;
output_offset
+=
out_image_size
;
}
}
}
// assume the input has already been padded
void
PoolingMaxNeonK2x2S2x2Padded
(
const
float
*
input
,
const
index_t
*
in_shape
,
float
*
output
,
const
index_t
*
out_shape
)
{
index_t
batch
=
in_shape
[
0
];
index_t
channels
=
in_shape
[
1
];
index_t
in_height
=
in_shape
[
2
];
index_t
in_width
=
in_shape
[
3
];
index_t
out_height
=
out_shape
[
2
];
index_t
out_width
=
out_shape
[
3
];
int
in_image_size
=
in_height
*
in_width
;
int
out_image_size
=
out_height
*
out_width
;
index_t
input_offset
=
0
;
index_t
output_offset
=
0
;
#pragma omp parallel for collapse(2)
for
(
int
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
int
c
=
0
;
c
<
channels
;
++
c
)
{
const
float
*
img0
=
input
+
input_offset
;
float
*
outptr
=
output
+
output_offset
;
const
float
*
r0
=
img0
;
const
float
*
r1
=
img0
+
in_width
;
for
(
int
h
=
0
;
h
<
out_height
;
++
h
)
{
int
num_vectors
=
out_width
>>
2
;
int
remain
=
out_width
-
(
num_vectors
<<
2
);
for
(;
num_vectors
>
0
;
--
num_vectors
)
{
float32x4_t
r00
=
vld1q_f32
(
r0
);
float32x4_t
r10
=
vld1q_f32
(
r1
);
float32x4_t
r01
=
vld1q_f32
(
r0
+
4
);
float32x4_t
r11
=
vld1q_f32
(
r1
+
4
);
float32x4_t
max0
=
vmaxq_f32
(
r00
,
r10
);
float32x4_t
max1
=
vmaxq_f32
(
r01
,
r11
);
float32x4_t
max_result
=
vpmaxq_f32
(
max0
,
max1
);
vst1q_f32
(
outptr
,
max_result
);
r0
+=
8
;
r1
+=
8
;
outptr
+=
4
;
}
for
(;
remain
>
0
;
--
remain
)
{
float
max0
=
std
::
max
(
r0
[
0
],
r0
[
1
]);
float
max1
=
std
::
max
(
r1
[
0
],
r1
[
1
]);
*
outptr
=
std
::
max
(
max0
,
max1
);
r0
+=
2
;
r1
+=
2
;
outptr
++
;
}
r0
+=
in_width
;
r1
+=
in_width
;
}
input_offset
+=
in_image_size
;
output_offset
+=
out_image_size
;
}
}
}
}
// namespace kernels
}
// namespace mace
mace/kernels/neon/max_pooling_neon_3x3.cc
已删除
100644 → 0
浏览文件 @
07f8ff18
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include <arm_neon.h>
#include <limits>
#include "mace/core/common.h"
namespace
mace
{
namespace
kernels
{
void
PoolingMaxNeonK3x3S2x2
(
const
float
*
input
,
const
index_t
*
in_shape
,
float
*
output
,
const
index_t
*
out_shape
,
const
int
*
paddings
)
{
index_t
batch
=
in_shape
[
0
];
index_t
channels
=
in_shape
[
1
];
index_t
in_height
=
in_shape
[
2
];
index_t
in_width
=
in_shape
[
3
];
index_t
out_height
=
out_shape
[
2
];
index_t
out_width
=
out_shape
[
3
];
int
padding_top
=
paddings
[
0
]
/
2
;
int
padding_bottom
=
paddings
[
0
]
-
padding_top
;
int
padding_left
=
paddings
[
1
]
/
2
;
int
padding_right
=
paddings
[
1
]
-
padding_left
;
int
in_image_size
=
in_height
*
in_width
;
int
out_image_size
=
out_height
*
out_width
;
index_t
input_offset
=
0
;
index_t
output_offset
=
0
;
#pragma omp parallel for collapse(2)
for
(
int
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
int
c
=
0
;
c
<
channels
;
++
c
)
{
float
*
outptr
=
output
+
output_offset
;
for
(
int
h
=
0
;
h
<
out_height
;
++
h
)
{
int
w
=
0
;
int
num_vectors
=
0
;
const
float
*
r0
,
*
r1
,
*
r2
;
if
(
!
((
h
==
0
&&
padding_top
>
0
)
||
(
h
==
out_height
-
1
&&
padding_bottom
>
0
)))
{
r0
=
input
+
input_offset
+
(
h
*
2
-
padding_top
)
*
in_width
;
r1
=
r0
+
in_width
;
r2
=
r1
+
in_width
;
if
(
padding_left
>
0
)
{
if
(
padding_left
==
1
)
{
float
max0
=
std
::
max
(
r0
[
0
],
r0
[
1
]);
float
max1
=
std
::
max
(
r1
[
0
],
r1
[
1
]);
float
max2
=
std
::
max
(
r2
[
0
],
r2
[
1
]);
*
outptr
=
std
::
max
(
std
::
max
(
max0
,
max1
),
max2
);
++
r0
;
++
r1
;
}
else
{
// padding_left == 2
float
max_tmp
=
std
::
max
(
r0
[
0
],
r1
[
0
]);
*
outptr
=
std
::
max
(
max_tmp
,
r2
[
0
]);
}
++
outptr
;
++
w
;
}
if
(
padding_right
>
0
)
{
num_vectors
=
(
out_width
-
w
-
1
)
>>
2
;
}
else
{
num_vectors
=
(
out_width
-
w
)
>>
2
;
}
}
w
+=
num_vectors
<<
2
;
float32x4x2_t
row0
=
vld2q_f32
(
r0
);
float32x4x2_t
row1
=
vld2q_f32
(
r1
);
float32x4x2_t
row2
=
vld2q_f32
(
r2
);
for
(;
num_vectors
>
0
;
--
num_vectors
)
{
float32x4x2_t
row0_next
=
vld2q_f32
(
r0
+
8
);
float32x4x2_t
row1_next
=
vld2q_f32
(
r1
+
8
);
float32x4x2_t
row2_next
=
vld2q_f32
(
r2
+
8
);
float32x4_t
max0
=
vmaxq_f32
(
row0
.
val
[
0
],
row0
.
val
[
1
]);
float32x4_t
max1
=
vmaxq_f32
(
row1
.
val
[
0
],
row1
.
val
[
1
]);
float32x4_t
max2
=
vmaxq_f32
(
row2
.
val
[
0
],
row2
.
val
[
1
]);
float32x4_t
row02
=
vextq_f32
(
row0
.
val
[
0
],
row0_next
.
val
[
0
],
1
);
float32x4_t
row12
=
vextq_f32
(
row1
.
val
[
0
],
row1_next
.
val
[
0
],
1
);
float32x4_t
row22
=
vextq_f32
(
row2
.
val
[
0
],
row2_next
.
val
[
0
],
1
);
max0
=
vmaxq_f32
(
max0
,
row02
);
max1
=
vmaxq_f32
(
max1
,
row12
);
max2
=
vmaxq_f32
(
max2
,
row22
);
float32x4_t
max_result
=
vmaxq_f32
(
vmaxq_f32
(
max0
,
max1
),
max2
);
vst1q_f32
(
outptr
,
max_result
);
row0
=
row0_next
;
row1
=
row1_next
;
row2
=
row2_next
;
r0
+=
8
;
r1
+=
8
;
r2
+=
8
;
outptr
+=
4
;
}
for
(;
w
<
out_width
;
++
w
)
{
float
max
=
std
::
numeric_limits
<
float
>::
lowest
();
for
(
int
kh
=
0
;
kh
<
3
;
++
kh
)
{
for
(
int
kw
=
0
;
kw
<
3
;
++
kw
)
{
int
inh
=
h
*
2
-
padding_top
+
kh
;
int
inw
=
w
*
2
-
padding_left
+
kw
;
if
(
inh
>=
0
&&
inh
<
in_height
&&
inw
>=
0
&&
inw
<
in_width
)
{
max
=
std
::
max
(
max
,
input
[
input_offset
+
inh
*
in_width
+
inw
]);
}
}
}
*
outptr
=
max
;
++
outptr
;
}
}
input_offset
+=
in_image_size
;
output_offset
+=
out_image_size
;
}
}
}
// assume the input has already been padded
void
PoolingMaxNeonK3x3S2x2Padded
(
const
float
*
input
,
const
index_t
*
in_shape
,
float
*
output
,
const
index_t
*
out_shape
)
{
index_t
batch
=
in_shape
[
0
];
index_t
channels
=
in_shape
[
1
];
index_t
in_height
=
in_shape
[
2
];
index_t
in_width
=
in_shape
[
3
];
index_t
out_height
=
out_shape
[
2
];
index_t
out_width
=
out_shape
[
3
];
int
in_image_size
=
in_height
*
in_width
;
int
out_image_size
=
out_height
*
out_width
;
index_t
input_offset
=
0
;
index_t
output_offset
=
0
;
#pragma omp parallel for collapse(2)
for
(
int
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
int
c
=
0
;
c
<
channels
;
++
c
)
{
const
float
*
img0
=
input
+
input_offset
;
float
*
outptr
=
output
+
output_offset
;
const
float
*
r0
=
img0
;
const
float
*
r1
=
r0
+
in_width
;
const
float
*
r2
=
r1
+
in_width
;
for
(
int
h
=
0
;
h
<
out_height
;
h
++
)
{
int
num_vectors
=
out_width
>>
2
;
int
remain
=
out_width
-
(
num_vectors
<<
2
);
float32x4x2_t
row0
=
vld2q_f32
(
r0
);
float32x4x2_t
row1
=
vld2q_f32
(
r1
);
float32x4x2_t
row2
=
vld2q_f32
(
r2
);
for
(;
num_vectors
>
0
;
num_vectors
--
)
{
float32x4x2_t
row0_next
=
vld2q_f32
(
r0
+
8
);
float32x4x2_t
row1_next
=
vld2q_f32
(
r1
+
8
);
float32x4x2_t
row2_next
=
vld2q_f32
(
r2
+
8
);
float32x4_t
max0
=
vmaxq_f32
(
row0
.
val
[
0
],
row0
.
val
[
1
]);
float32x4_t
max1
=
vmaxq_f32
(
row1
.
val
[
0
],
row1
.
val
[
1
]);
float32x4_t
max2
=
vmaxq_f32
(
row2
.
val
[
0
],
row2
.
val
[
1
]);
float32x4_t
row02
=
vextq_f32
(
row0
.
val
[
0
],
row0_next
.
val
[
0
],
1
);
float32x4_t
row12
=
vextq_f32
(
row1
.
val
[
0
],
row1_next
.
val
[
0
],
1
);
float32x4_t
row22
=
vextq_f32
(
row2
.
val
[
0
],
row2_next
.
val
[
0
],
1
);
max0
=
vmaxq_f32
(
max0
,
row02
);
max1
=
vmaxq_f32
(
max1
,
row12
);
max2
=
vmaxq_f32
(
max2
,
row22
);
float32x4_t
max_result
=
vmaxq_f32
(
vmaxq_f32
(
max0
,
max1
),
max2
);
vst1q_f32
(
outptr
,
max_result
);
row0
=
row0_next
;
row1
=
row1_next
;
row2
=
row2_next
;
r0
+=
8
;
r1
+=
8
;
r2
+=
8
;
outptr
+=
4
;
}
for
(;
remain
>
0
;
remain
--
)
{
float
max0
=
std
::
max
(
std
::
max
(
r0
[
0
],
r0
[
1
]),
r0
[
2
]);
float
max1
=
std
::
max
(
std
::
max
(
r1
[
0
],
r1
[
1
]),
r1
[
2
]);
float
max2
=
std
::
max
(
std
::
max
(
r2
[
0
],
r2
[
1
]),
r2
[
2
]);
*
outptr
=
std
::
max
(
std
::
max
(
max0
,
max1
),
max2
);
r0
+=
2
;
r1
+=
2
;
r2
+=
2
;
outptr
++
;
}
r0
+=
1
+
in_width
;
r1
+=
1
+
in_width
;
r2
+=
1
+
in_width
;
}
input_offset
+=
in_image_size
;
output_offset
+=
out_image_size
;
}
}
}
}
// namespace kernels
}
// namespace mace
mace/kernels/neon/pooling_neon.cc
已删除
100644 → 0
浏览文件 @
07f8ff18
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/kernels/pooling.h"
namespace
mace
{
namespace
kernels
{
// NEON pooling kernels defined outside this file. These variants take the
// un-padded input together with the total {vertical, horizontal} paddings.
void PoolingMaxNeonK2x2S2x2(const float *input,
                            const index_t *in_shape,
                            float *output,
                            const index_t *out_shape,
                            const int *paddings);

void PoolingAvgNeonK2x2S2x2(const float *input,
                            const index_t *in_shape,
                            float *output,
                            const index_t *out_shape,
                            const int *paddings);

void PoolingMaxNeonK3x3S2x2(const float *input,
                            const index_t *in_shape,
                            float *output,
                            const index_t *out_shape,
                            const int *paddings);

void PoolingAvgNeonK3x3S2x2(const float *input,
                            const index_t *in_shape,
                            float *output,
                            const index_t *out_shape,
                            const int *paddings);

#ifdef __COPY_MAKE_PADDING
// Variants used when the input is copied into a padded buffer first; they
// take no paddings argument.
void PoolingMaxNeonK2x2S2x2Padded(const float *input,
                                  const index_t *in_shape,
                                  float *output,
                                  const index_t *out_shape);

void PoolingAvgNeonK2x2S2x2Padded(const float *input,
                                  const index_t *in_shape,
                                  float *output,
                                  const index_t *out_shape);

void PoolingMaxNeonK3x3S2x2Padded(const float *input,
                                  const index_t *in_shape,
                                  float *output,
                                  const index_t *out_shape);

void PoolingAvgNeonK3x3S2x2Padded(const float *input,
                                  const index_t *in_shape,
                                  float *output,
                                  const index_t *out_shape);
#endif
// NEON pooling entry point. Computes the output shape and paddings, resizes
// the output tensor, and then runs a hand-written NEON kernel when the window
// is 2x2 or 3x3 with stride 2x2. Every other configuration is forwarded to
// the CPU functor.
//
// When __COPY_MAKE_PADDING is defined the input is first copied into a padded
// tensor and the "Padded" kernels are used; otherwise the kernels receive the
// original input together with the paddings.
template <>
void PoolingFunctor<DeviceType::NEON, float>::operator()(
    const Tensor *input_tensor,
    Tensor *output_tensor,
    StatsFuture *future) {
  std::vector<index_t> out_dims(4);
  std::vector<int> pad_sizes(2);

  // Describe the pooling window as a filter: both leading entries are the
  // input's dimension 1, followed by the window height and width.
  std::vector<index_t> window_shape(4);
  window_shape[0] = input_tensor->shape()[1];
  window_shape[1] = input_tensor->shape()[1];
  window_shape[2] = kernels_[0];
  window_shape[3] = kernels_[1];

  kernels::CalcPaddingAndOutputSize(
      input_tensor->shape().data(), window_shape.data(), this->dilations_,
      strides_, this->padding_, out_dims.data(), pad_sizes.data());
  output_tensor->Resize(out_dims);

  const float *src = input_tensor->data<float>();
  float *dst = output_tensor->mutable_data<float>();
  const index_t *src_shape = input_tensor->shape().data();

#ifdef __COPY_MAKE_PADDING
  Tensor padded_input;
  ConstructInputWithPadding(input_tensor, pad_sizes.data(), &padded_input);
  src = padded_input.data<float>();
  src_shape = padded_input.shape().data();
#endif

  const bool stride_2x2 = strides_[0] == 2 && strides_[1] == 2;
  const bool window_2x2 = kernels_[0] == 2 && kernels_[1] == 2;
  const bool window_3x3 = kernels_[0] == 3 && kernels_[1] == 3;
  const bool is_max = pooling_type_ == MAX;

  if (!stride_2x2 || !(window_2x2 || window_3x3)) {
    // No NEON kernel for this configuration yet: use the CPU implementation.
    PoolingFunctor<DeviceType::CPU, float>(pooling_type_, kernels_, strides_,
                                           padding_, dilations_)(
        input_tensor, output_tensor, future);
    return;
  }

#ifdef __COPY_MAKE_PADDING
  if (window_2x2) {
    if (is_max) {
      PoolingMaxNeonK2x2S2x2Padded(src, src_shape, dst, out_dims.data());
    } else {
      PoolingAvgNeonK2x2S2x2Padded(src, src_shape, dst, out_dims.data());
    }
  } else {
    if (is_max) {
      PoolingMaxNeonK3x3S2x2Padded(src, src_shape, dst, out_dims.data());
    } else {
      PoolingAvgNeonK3x3S2x2Padded(src, src_shape, dst, out_dims.data());
    }
  }
#else
  if (window_2x2) {
    if (is_max) {
      PoolingMaxNeonK2x2S2x2(src, src_shape, dst, out_dims.data(),
                             pad_sizes.data());
    } else {
      PoolingAvgNeonK2x2S2x2(src, src_shape, dst, out_dims.data(),
                             pad_sizes.data());
    }
  } else {
    if (is_max) {
      PoolingMaxNeonK3x3S2x2(src, src_shape, dst, out_dims.data(),
                             pad_sizes.data());
    } else {
      PoolingAvgNeonK3x3S2x2(src, src_shape, dst, out_dims.data(),
                             pad_sizes.data());
    }
  }
#endif
}
}
// namespace kernels
}
// namespace mace
mace/kernels/neon/relu_neon.cc
已删除
100644 → 0
浏览文件 @
07f8ff18
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/kernels/relu.h"
#include <arm_neon.h>
namespace
mace
{
namespace
kernels
{
// NEON ReLU. When max_limit_ is negative each element becomes max(x, 0);
// otherwise the result is additionally capped at max_limit_.
//
// The tensor is processed in groups of kCostPerGroup elements, one OpenMP
// iteration per group; inside a group, full 4-float vectors are handled with
// NEON and the leftover elements one at a time.
template <>
void ActivationFunctor<DeviceType::NEON, float>::operator()(
    const Tensor *input_tensor,
    Tensor *output_tensor,
    StatsFuture *future) {
  const float *input = input_tensor->data<float>();
  float *output = output_tensor->mutable_data<float>();
  index_t size = input_tensor->size();

  if (max_limit_ < 0) {
    // Plain ReLU: no upper bound.
#pragma omp parallel for
    for (int64_t start = 0; start < size; start += kCostPerGroup) {
      const int64_t group_len =
          std::min(static_cast<int64_t>(kCostPerGroup), size - start);
      const int quads = group_len >> 2;
      const int tail = group_len - (quads << 2);

      const float *src = input + start;
      float *dst = output + start;
      const float32x4_t zeros = vdupq_n_f32(0.f);

      for (int q = 0; q < quads; ++q) {
        vst1q_f32(dst, vmaxq_f32(vld1q_f32(src), zeros));
        src += 4;
        dst += 4;
      }
      for (int t = 0; t < tail; ++t) {
        dst[t] = std::max(src[t], 0.f);
      }
    }
  } else {
    // Bounded ReLU: clamp to [0, max_limit_].
#pragma omp parallel for
    for (int64_t start = 0; start < size; start += kCostPerGroup) {
      const int64_t group_len =
          std::min(static_cast<int64_t>(kCostPerGroup), size - start);
      const int quads = group_len >> 2;
      const int tail = group_len - (quads << 2);

      const float *src = input + start;
      float *dst = output + start;
      const float32x4_t zeros = vdupq_n_f32(0.f);
      const float32x4_t upper = vdupq_n_f32(max_limit_);

      for (int q = 0; q < quads; ++q) {
        const float32x4_t rectified = vmaxq_f32(vld1q_f32(src), zeros);
        vst1q_f32(dst, vminq_f32(rectified, upper));
        src += 4;
        dst += 4;
      }
      for (int t = 0; t < tail; ++t) {
        dst[t] = std::min(std::max(src[t], 0.f), max_limit_);
      }
    }
  }
}
}
// namespace kernels
}
// namespace mace
mace/kernels/opencl/cl/buffer_to_image.cl
浏览文件 @
780f5a60
#
include
<common.h>
__kernel
void
filter_buffer_to_image
(
__global
const
DATA_TYPE
*input,
/*
h,
w,
ic,
o
c
*/
__kernel
void
filter_buffer_to_image
(
__global
const
DATA_TYPE
*input,
/*
h,
w,
oc,
i
c
*/
__private
const
int
filter_w,
__private
const
int
in_channel,
__private
const
int
out_channel,
__private
const
int
in_channel,
__write_only
image2d_t
output
)
{
int
w
=
get_global_id
(
0
)
;
int
h
=
get_global_id
(
1
)
;
...
...
@@ -13,23 +13,26 @@ __kernel void filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, i
const
int
in_channel_idx
=
w
%
rounded_in_channel
;
const
int
h_idx
=
hw_idx
/
filter_w
;
const
int
w_idx
=
hw_idx
%
filter_w
;
const
int
offset
=
((
h_idx
*
filter_w
+
w_idx
)
*
in_channel
+
in_channel_idx
)
*
out
_channel
+
out
_channel_idx
;
const
int
offset
=
((
h_idx
*
filter_w
+
w_idx
)
*
out_channel
+
out_channel_idx
)
*
in
_channel
+
in
_channel_idx
;
const
int
size
=
out_channel
-
out_channel_idx
;
VEC_DATA_TYPE
(
DATA_TYPE,
4
)
values
=
0
;
if
(
in_channel_idx
<
in_channel
)
{
if
(
out_channel_idx
<
out_channel
)
{
const
int
size
=
out_channel
-
out_channel_idx
;
if
(
size
<
4
)
{
switch
(
size
)
{
switch
(
size
)
{
case
3:
values.z
=
*
(
input
+
offset
+
2
)
;
values.z
=
*
(
input
+
offset
+
2
*
in_channel
)
;
case
2:
values.y
=
*
(
input
+
offset
+
1
)
;
values.y
=
*
(
input
+
offset
+
1
*
in_channel
)
;
case
1:
values.x
=
*
(
input
+
offset
)
;
}
}
else
{
values
=
vload4
(
0
,
input
+
offset
)
;
values.w
=
*
(
input
+
offset
+
3
*
in_channel
)
;
values.z
=
*
(
input
+
offset
+
2
*
in_channel
)
;
values.y
=
*
(
input
+
offset
+
1
*
in_channel
)
;
values.x
=
*
(
input
+
offset
)
;
}
}
...
...
@@ -37,10 +40,10 @@ __kernel void filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, i
CMD_TYPE
(
write_image,
CMD_DATA_TYPE
)(
output,
coord,
values
)
;
}
__kernel
void
filter_image_to_buffer
(
__global
DATA_TYPE
*output,
/*
h,
w,
ic,
o
c
*/
__kernel
void
filter_image_to_buffer
(
__global
DATA_TYPE
*output,
/*
h,
w,
oc,
i
c
*/
__private
const
int
filter_w,
__private
const
int
in_channel,
__private
const
int
out_channel,
__private
const
int
in_channel,
__read_only
image2d_t
input
)
{
int
w
=
get_global_id
(
0
)
;
int
h
=
get_global_id
(
1
)
;
...
...
@@ -50,29 +53,31 @@ __kernel void filter_image_to_buffer(__global DATA_TYPE *output, /* h, w, ic, oc
const
int
in_channel_idx
=
w
%
rounded_in_channel
;
const
int
h_idx
=
hw_idx
/
filter_w
;
const
int
w_idx
=
hw_idx
%
filter_w
;
const
int
offset
=
((
h_idx
*
filter_w
+
w_idx
)
*
in_channel
+
in_channel_idx
)
*
out
_channel
+
out
_channel_idx
;
const
int
offset
=
((
h_idx
*
filter_w
+
w_idx
)
*
out_channel
+
out_channel_idx
)
*
in
_channel
+
in
_channel_idx
;
if
(
in_channel_idx
<
in
_channel
)
{
if
(
out_channel_idx
<
out
_channel
)
{
int2
coord
=
(
int2
)(
w,
h
)
;
VEC_DATA_TYPE
(
DATA_TYPE,
4
)
values
=
CMD_TYPE
(
read_image,
CMD_DATA_TYPE
)(
input,
SAMPLER,
coord
)
;
const
int
size
=
(
out_channel
-
out_channel_idx
)
;
if
(
size
<
4
)
{
switch
(
size
)
{
case
3:
output[offset
+2]
=
values.s2
;
output[offset
+
2
*
in_channel]
=
values.z
;
case
2:
output[offset
+1]
=
values.s1
;
output[offset
+
1
*
in_channel]
=
values.y
;
case
1:
output[offset]
=
values.
s0
;
output[offset]
=
values.
x
;
}
}
else
{
vstore4
(
values,
0
,
output
+
offset
)
;
output[offset
+
3
*
in_channel]
=
values.w
;
output[offset
+
2
*
in_channel]
=
values.z
;
output[offset
+
1
*
in_channel]
=
values.y
;
output[offset]
=
values.x
;
}
}
}
__kernel
void
dw_filter_buffer_to_image
(
__global
const
DATA_TYPE
*input,
/*
h,
w,
ic,
m
*/
__private
const
int
filter_w,
__private
const
int
in_channel,
...
...
mace/kernels/opencl/depthwise_conv_opencl.cc
浏览文件 @
780f5a60
...
...
@@ -149,8 +149,8 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()(
std
::
vector
<
index_t
>
fake_filter_shape
(
4
);
fake_filter_shape
[
0
]
=
filter
->
shape
()[
0
];
fake_filter_shape
[
1
]
=
filter
->
shape
()[
1
];
fake_filter_shape
[
3
]
=
filter
->
shape
()[
2
]
*
filter
->
shape
()[
3
];
fake_filter_shape
[
2
]
=
1
;
fake_filter_shape
[
2
]
=
filter
->
shape
()[
2
]
*
filter
->
shape
()[
3
];
fake_filter_shape
[
3
]
=
1
;
std
::
vector
<
index_t
>
output_shape
(
4
);
std
::
vector
<
int
>
paddings
(
2
);
...
...
mace/kernels/opencl/helper.cc
浏览文件 @
780f5a60
...
...
@@ -19,12 +19,12 @@ void CalInOutputImageShape(const std::vector<index_t> &shape, /* NHWC */
}
// [RoundUp<4>(Ic) * H * W, (Oc + 3) / 4]
void
CalConv2dFilterImageShape
(
const
std
::
vector
<
index_t
>
&
shape
,
/* HW
IO
*/
void
CalConv2dFilterImageShape
(
const
std
::
vector
<
index_t
>
&
shape
,
/* HW
OI
*/
std
::
vector
<
size_t
>
&
image_shape
)
{
MACE_CHECK
(
shape
.
size
()
==
4
);
image_shape
.
resize
(
2
);
image_shape
[
0
]
=
shape
[
0
]
*
shape
[
1
]
*
RoundUp
<
index_t
>
(
shape
[
2
],
4
);
image_shape
[
1
]
=
RoundUpDiv4
(
shape
[
3
]);
image_shape
[
0
]
=
shape
[
0
]
*
shape
[
1
]
*
RoundUp
<
index_t
>
(
shape
[
3
],
4
);
image_shape
[
1
]
=
RoundUpDiv4
(
shape
[
2
]);
}
// [H * W * M, (Ic + 3) / 4]
...
...
@@ -179,6 +179,7 @@ void TuningOrRun3DKernel(cl::Kernel &kernel,
local_ws
[
2
]
=
std
::
min
<
uint32_t
>
(
gws
[
2
],
kwg_size
/
(
local_ws
[
0
]
*
local_ws
[
1
]));
return
{
// TODO tuning these magic numbers
{
local_ws
[
0
],
local_ws
[
1
],
local_ws
[
2
],
1
},
{
kwg_size
/
16
,
4
,
4
,
1
},
{
kwg_size
/
32
,
4
,
8
,
1
},
...
...
@@ -200,7 +201,7 @@ void TuningOrRun3DKernel(cl::Kernel &kernel,
{
9
,
7
,
15
,
1
},
{
15
,
7
,
9
,
1
},
{
1
,
kwg_size
,
1
,
1
},
{
4
,
15
,
8
,
1
},
// SNPE size
{
4
,
15
,
8
,
1
},
};
};
cl
::
Event
event
;
...
...
mace/ops/activation.cc
浏览文件 @
780f5a60
...
...
@@ -13,14 +13,6 @@ void Register_Activation(OperatorRegistry *op_registry) {
.
Build
(),
ActivationOp
<
DeviceType
::
CPU
,
float
>
);
#if MACE_ENABLE_NEON
REGISTER_OPERATOR
(
op_registry
,
OpKeyBuilder
(
"Activation"
)
.
Device
(
DeviceType
::
NEON
)
.
TypeConstraint
<
float
>
(
"T"
)
.
Build
(),
ActivationOp
<
DeviceType
::
NEON
,
float
>
);
#endif // MACE_ENABLE_NEON
REGISTER_OPERATOR
(
op_registry
,
OpKeyBuilder
(
"Activation"
)
.
Device
(
DeviceType
::
OPENCL
)
.
TypeConstraint
<
float
>
(
"T"
)
...
...
mace/ops/activation_benchmark.cc
浏览文件 @
780f5a60
...
...
@@ -298,14 +298,15 @@ static void SigmoidBenchmark(
} \
BENCHMARK(BM_SIGMOID_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
#define BM_SIGMOID(N, C, H, W, TYPE) \
BM_SIGMOID_MACRO(N, C, H, W, TYPE, CPU); \
BM_SIGMOID_MACRO(N, C, H, W, TYPE, OPENCL);
BM_SIGMOID
(
1
,
1
,
512
,
512
,
float
);
BM_SIGMOID
(
1
,
3
,
128
,
128
,
float
);
BM_SIGMOID
(
1
,
3
,
512
,
512
,
float
);
BM_SIGMOID
(
1
,
32
,
112
,
112
,
float
);
BM_SIGMOID
(
1
,
64
,
256
,
256
,
float
);
#define BM_SIGMOID(N, C, H, W) \
BM_SIGMOID_MACRO(N, C, H, W, float, CPU); \
BM_SIGMOID_MACRO(N, C, H, W, float, OPENCL); \
BM_SIGMOID_MACRO(N, C, H, W, half, OPENCL);
BM_SIGMOID
(
1
,
1
,
512
,
512
);
BM_SIGMOID
(
1
,
3
,
128
,
128
);
BM_SIGMOID
(
1
,
3
,
512
,
512
);
BM_SIGMOID
(
1
,
32
,
112
,
112
);
BM_SIGMOID
(
1
,
64
,
256
,
256
);
}
// namespace mace
mace/ops/activation_test.cc
浏览文件 @
780f5a60
...
...
@@ -53,10 +53,6 @@ void TestSimpleRelu() {
TEST_F
(
ActivationOpTest
,
CPUSimpleRelu
)
{
TestSimpleRelu
<
DeviceType
::
CPU
>
();
}
#if __ARM_NEON
TEST_F
(
ActivationOpTest
,
NEONSimpleRelu
)
{
TestSimpleRelu
<
DeviceType
::
NEON
>
();
}
#endif
TEST_F
(
ActivationOpTest
,
OPENCLSimpleRelu
)
{
TestSimpleRelu
<
DeviceType
::
OPENCL
>
();
}
...
...
@@ -104,12 +100,6 @@ TEST_F(ActivationOpTest, CPUUnalignedSimpleRelu) {
TestUnalignedSimpleRelu
<
DeviceType
::
CPU
>
();
}
#if __ARM_NEON
TEST_F
(
ActivationOpTest
,
NEONUnalignedSimpleRelu
)
{
TestUnalignedSimpleRelu
<
DeviceType
::
NEON
>
();
}
#endif
TEST_F
(
ActivationOpTest
,
OPENCLUnalignedSimpleRelu
)
{
TestUnalignedSimpleRelu
<
DeviceType
::
OPENCL
>
();
}
...
...
@@ -160,10 +150,6 @@ void TestSimpleRelux() {
TEST_F
(
ActivationOpTest
,
CPUSimple
)
{
TestSimpleRelux
<
DeviceType
::
CPU
>
();
}
#if __ARM_NEON
TEST_F
(
ActivationOpTest
,
NEONSimple
)
{
TestSimpleRelux
<
DeviceType
::
NEON
>
();
}
#endif
TEST_F
(
ActivationOpTest
,
OPENCLSimple
)
{
TestSimpleRelux
<
DeviceType
::
OPENCL
>
();
}
...
...
@@ -216,12 +202,6 @@ TEST_F(ActivationOpTest, CPUSimpleRelux) {
TestSimpleReluRelux
<
DeviceType
::
CPU
>
();
}
#if __ARM_NEON
TEST_F
(
ActivationOpTest
,
NEONSimpleRelux
)
{
TestSimpleReluRelux
<
DeviceType
::
NEON
>
();
}
#endif
TEST_F
(
ActivationOpTest
,
OPENCLSimpleRelux
)
{
TestSimpleReluRelux
<
DeviceType
::
OPENCL
>
();
}
...
...
@@ -272,12 +252,6 @@ void TestSimplePrelu() {
TEST_F
(
ActivationOpTest
,
CPUSimplePrelu
)
{
TestSimplePrelu
<
DeviceType
::
CPU
>
();
}
#if __ARM_NEON
TEST_F
(
ActivationOpTest
,
NEONSimplePrelu
)
{
TestSimplePrelu
<
DeviceType
::
NEON
>
();
}
#endif
TEST_F
(
ActivationOpTest
,
OPENCLSimplePrelu
)
{
TestSimplePrelu
<
DeviceType
::
OPENCL
>
();
}
...
...
@@ -329,10 +303,6 @@ void TestSimpleTanh() {
TEST_F
(
ActivationOpTest
,
CPUSimpleTanh
)
{
TestSimpleTanh
<
DeviceType
::
CPU
>
();
}
#if __ARM_NEON
TEST_F
(
ActivationOpTest
,
NEONSimpleTanh
)
{
TestSimpleTanh
<
DeviceType
::
NEON
>
();
}
#endif
TEST_F
(
ActivationOpTest
,
OPENCLSimpleTanh
)
{
TestSimpleTanh
<
DeviceType
::
OPENCL
>
();
}
...
...
@@ -387,12 +357,6 @@ TEST_F(ActivationOpTest, CPUSimpleSigmoid) {
TestSimpleSigmoid
<
DeviceType
::
CPU
>
();
}
#if __ARM_NEON
TEST_F
(
ActivationOpTest
,
NEONSimpleSigmoid
)
{
TestSimpleSigmoid
<
DeviceType
::
NEON
>
();
}
#endif
TEST_F
(
ActivationOpTest
,
OPENCLSimpleSigmoid
)
{
TestSimpleSigmoid
<
DeviceType
::
OPENCL
>
();
}
...
...
mace/ops/addn_benchmark.cc
浏览文件 @
780f5a60
...
...
@@ -65,16 +65,16 @@ static void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) {
} \
BENCHMARK(BM_ADDN_##INPUTS##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)
#define BM_ADDN(INPUTS, N, H, W, C, TYPE) \
BM_ADDN_MACRO(INPUTS, N, H, W, C, TYPE, CPU); \
BM_ADDN_MACRO(INPUTS, N, H, W, C, TYPE, OPENCL);
#define BM_ADDN(INPUTS, N, H, W, C) \
BM_ADDN_MACRO(INPUTS, N, H, W, C, float, CPU); \
BM_ADDN_MACRO(INPUTS, N, H, W, C, float, NEON); \
BM_ADDN_MACRO(INPUTS, N, H, W, C, float, OPENCL); \
BM_ADDN_MACRO(INPUTS, N, H, W, C, half, OPENCL);
BM_ADDN
(
2
,
1
,
256
,
256
,
32
,
float
);
BM_ADDN
(
2
,
1
,
128
,
128
,
32
,
float
);
// BM_ADDN(2, 1, 240, 240, 256, half);
BM_ADDN
(
4
,
1
,
128
,
128
,
3
,
float
);
BM_ADDN
(
2
,
1
,
256
,
256
,
3
,
float
);
BM_ADDN
(
2
,
1
,
512
,
512
,
3
,
float
);
// BM_ADDN(4, 1, 240, 240, 256, half);
BM_ADDN
(
2
,
1
,
256
,
256
,
32
);
BM_ADDN
(
2
,
1
,
128
,
128
,
32
);
BM_ADDN
(
4
,
1
,
128
,
128
,
3
);
BM_ADDN
(
2
,
1
,
256
,
256
,
3
);
BM_ADDN
(
2
,
1
,
512
,
512
,
3
);
}
//
namespace mace
}
// namespace mace
mace/ops/addn_test.cc
浏览文件 @
780f5a60
...
...
@@ -33,12 +33,8 @@ void SimpleAdd2() {
TEST_F
(
AddnOpTest
,
CPUSimpleAdd2
)
{
SimpleAdd2
<
DeviceType
::
CPU
>
();
}
/*
TEST_F
(
AddnOpTest
,
NEONSimpleAdd2
)
{
SimpleAdd2
<
DeviceType
::
NEON
>
();
}
TEST_F(AddnOpTest, OPENCLSimpleAdd2) { SimpleAdd2<DeviceType::OPENCL>(); }
*/
template
<
DeviceType
D
>
void
SimpleAdd3
()
{
// Construct graph
...
...
@@ -65,9 +61,7 @@ void SimpleAdd3() {
TEST_F
(
AddnOpTest
,
CPUSimpleAdd3
)
{
SimpleAdd3
<
DeviceType
::
CPU
>
();
}
/*
TEST_F
(
AddnOpTest
,
NEONSimpleAdd3
)
{
SimpleAdd3
<
DeviceType
::
NEON
>
();
}
*/
template
<
DeviceType
D
>
void
RandomTest
()
{
...
...
mace/ops/batch_norm_benchmark.cc
浏览文件 @
780f5a60
...
...
@@ -82,21 +82,24 @@ static void BatchNorm(
} \
BENCHMARK(BM_BATCH_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
#define BM_BATCH_NORM(N, C, H, W, TYPE) \
BM_BATCH_NORM_MACRO(N, C, H, W, TYPE, CPU); \
BM_BATCH_NORM_MACRO(N, C, H, W, TYPE, OPENCL);
#define BM_BATCH_NORM(N, C, H, W) \
BM_BATCH_NORM_MACRO(N, C, H, W, float, CPU); \
BM_BATCH_NORM_MACRO(N, C, H, W, float, NEON); \
BM_BATCH_NORM_MACRO(N, C, H, W, float, OPENCL); \
BM_BATCH_NORM_MACRO(N, C, H, W, half, OPENCL);
BM_BATCH_NORM
(
1
,
1
,
512
,
512
,
float
);
BM_BATCH_NORM
(
1
,
3
,
128
,
128
,
float
);
BM_BATCH_NORM
(
1
,
3
,
512
,
512
,
float
);
BM_BATCH_NORM
(
1
,
32
,
112
,
112
,
float
);
BM_BATCH_NORM
(
1
,
64
,
256
,
256
,
float
);
BM_BATCH_NORM
(
1
,
64
,
512
,
512
,
float
);
BM_BATCH_NORM
(
1
,
128
,
56
,
56
,
float
);
BM_BATCH_NORM
(
1
,
128
,
256
,
256
,
float
);
BM_BATCH_NORM
(
1
,
256
,
14
,
14
,
float
);
BM_BATCH_NORM
(
1
,
512
,
14
,
14
,
float
);
BM_BATCH_NORM
(
1
,
1024
,
7
,
7
,
float
);
BM_BATCH_NORM
(
32
,
1
,
256
,
256
,
float
);
BM_BATCH_NORM
(
32
,
3
,
256
,
256
,
float
);
}
// namespace mace
BM_BATCH_NORM
(
1
,
1
,
512
,
512
);
BM_BATCH_NORM
(
1
,
3
,
128
,
128
);
BM_BATCH_NORM
(
1
,
3
,
512
,
512
);
BM_BATCH_NORM
(
1
,
32
,
112
,
112
);
BM_BATCH_NORM
(
1
,
64
,
256
,
256
);
BM_BATCH_NORM
(
1
,
64
,
512
,
512
);
BM_BATCH_NORM
(
1
,
128
,
56
,
56
);
BM_BATCH_NORM
(
1
,
128
,
256
,
256
);
BM_BATCH_NORM
(
1
,
256
,
14
,
14
);
BM_BATCH_NORM
(
1
,
512
,
14
,
14
);
BM_BATCH_NORM
(
1
,
1024
,
7
,
7
);
BM_BATCH_NORM
(
32
,
1
,
256
,
256
);
BM_BATCH_NORM
(
32
,
3
,
256
,
256
);
}
// namespace mace
mace/ops/batch_norm_test.cc
浏览文件 @
780f5a60
...
...
@@ -72,23 +72,18 @@ void Simple() {
TEST_F
(
BatchNormOpTest
,
SimpleCPU
)
{
Simple
<
DeviceType
::
CPU
>
();
}
/*
TEST_F(BatchNormOpTest, SimpleNEON) {
Simple<DeviceType::NEON>();
}
*/
TEST_F
(
BatchNormOpTest
,
SimpleNEON
)
{
Simple
<
DeviceType
::
NEON
>
();
}
TEST_F
(
BatchNormOpTest
,
SimpleOPENCL
)
{
Simple
<
DeviceType
::
OPENCL
>
();
}
/*
TEST_F
(
BatchNormOpTest
,
SimpleRandomNeon
)
{
srand
(
time
(
NULL
));
// generate random input
index_t
batch
=
1
+
rand
()
%
10
;
index_t channels = 3 + rand() % 50;
index_t
height
=
64
;
index_t
width
=
64
;
index_t
channels
=
3
+
rand
()
%
50
;
// Construct graph
OpsTestNet
net
;
OpDefBuilder
(
"BatchNorm"
,
"BatchNormTest"
)
...
...
@@ -97,18 +92,17 @@ TEST_F(BatchNormOpTest, SimpleRandomNeon) {
.
Input
(
"Offset"
)
.
Input
(
"Mean"
)
.
Input
(
"Var"
)
.
Input("Epsilon"
)
.
AddFloatArg
(
"epsilon"
,
1e-3
)
.
Output
(
"Output"
)
.
Finalize
(
net
.
NewOperatorDef
());
// Add input data
net.AddRandomInput<DeviceType::CPU, float>("Input",
{batch, channels, height,
width
});
net
.
AddRandomInput
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
{
batch
,
height
,
width
,
channels
});
net
.
AddRandomInput
<
DeviceType
::
CPU
,
float
>
(
"Scale"
,
{
channels
});
net
.
AddRandomInput
<
DeviceType
::
CPU
,
float
>
(
"Offset"
,
{
channels
});
net
.
AddRandomInput
<
DeviceType
::
CPU
,
float
>
(
"Mean"
,
{
channels
});
net
.
AddRandomInput
<
DeviceType
::
CPU
,
float
>
(
"Var"
,
{
channels
},
true
);
net.AddInputFromArray<DeviceType::CPU, float>("Epsilon", {}, {1e-3});
// run cpu
net
.
RunOp
();
...
...
@@ -139,18 +133,17 @@ TEST_F(BatchNormOpTest, ComplexRandomNeon) {
.
Input
(
"Offset"
)
.
Input
(
"Mean"
)
.
Input
(
"Var"
)
.
Input("Epsilon"
)
.
AddFloatArg
(
"epsilon"
,
1e-3
)
.
Output
(
"Output"
)
.
Finalize
(
net
.
NewOperatorDef
());
// Add input data
net.AddRandomInput<DeviceType::CPU, float>("Input",
{batch, channels, height,
width
});
net
.
AddRandomInput
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
{
batch
,
height
,
width
,
channels
});
net
.
AddRandomInput
<
DeviceType
::
CPU
,
float
>
(
"Scale"
,
{
channels
});
net
.
AddRandomInput
<
DeviceType
::
CPU
,
float
>
(
"Offset"
,
{
channels
});
net
.
AddRandomInput
<
DeviceType
::
CPU
,
float
>
(
"Mean"
,
{
channels
});
net
.
AddRandomInput
<
DeviceType
::
CPU
,
float
>
(
"Var"
,
{
channels
},
true
);
net.AddInputFromArray<DeviceType::CPU, float>("Epsilon", {}, {1e-3});
// run cpu
net
.
RunOp
();
...
...
@@ -164,7 +157,6 @@ width});
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"Output"
),
1e-2
);
}
*/
TEST_F
(
BatchNormOpTest
,
SimpleRandomOPENCL
)
{
srand
(
time
(
NULL
));
...
...
mace/ops/batch_to_space_benchmark.cc
浏览文件 @
780f5a60
...
...
@@ -47,10 +47,10 @@ static void BMBatchToSpace(
} \
BENCHMARK(BM_BATCH_TO_SPACE_##N##_##H##_##W##_##C##_##ARG##_##TYPE##_##DEVICE)
#define BM_BATCH_TO_SPACE(N, H, W, C, ARG
, TYPE
) \
BM_BATCH_TO_SPACE_MACRO(N, H, W, C, ARG,
TYPE
, OPENCL);
#define BM_BATCH_TO_SPACE(N, H, W, C, ARG) \
BM_BATCH_TO_SPACE_MACRO(N, H, W, C, ARG,
float
, OPENCL);
BM_BATCH_TO_SPACE
(
128
,
8
,
8
,
128
,
2
,
float
);
BM_BATCH_TO_SPACE
(
4
,
128
,
128
,
32
,
2
,
float
);
BM_BATCH_TO_SPACE
(
16
,
64
,
64
,
32
,
4
,
float
);
}
// namespace mace
\ No newline at end of file
BM_BATCH_TO_SPACE
(
128
,
8
,
8
,
128
,
2
);
BM_BATCH_TO_SPACE
(
4
,
128
,
128
,
32
,
2
);
BM_BATCH_TO_SPACE
(
16
,
64
,
64
,
32
,
4
);
}
// namespace mace
mace/ops/bias_add.cc
浏览文件 @
780f5a60
...
...
@@ -13,16 +13,6 @@ void Register_BiasAdd(OperatorRegistry *op_registry) {
.
Build
(),
BiasAddOp
<
DeviceType
::
CPU
,
float
>
);
/*
#if __ARM_NEON
REGISTER_OPERATOR(op_registry,OpKeyBuilder("BiasAdd")
.Device(DeviceType::NEON)
.TypeConstraint<float>("T")
.Build(),
BiasAddOp<DeviceType::NEON, float>);
#endif // __ARM_NEON
*/
REGISTER_OPERATOR
(
op_registry
,
OpKeyBuilder
(
"BiasAdd"
)
.
Device
(
DeviceType
::
OPENCL
)
.
TypeConstraint
<
float
>
(
"T"
)
...
...
mace/ops/bias_add_benchmark.cc
浏览文件 @
780f5a60
...
...
@@ -59,21 +59,22 @@ static void BiasAdd(int iters, int batch, int channels, int height, int width) {
} \
BENCHMARK(BM_BIAS_ADD_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
#define BM_BIAS_ADD(N, C, H, W, TYPE) \
BM_BIAS_ADD_MACRO(N, C, H, W, TYPE, CPU); \
BM_BIAS_ADD_MACRO(N, C, H, W, TYPE, OPENCL);
#define BM_BIAS_ADD(N, C, H, W) \
BM_BIAS_ADD_MACRO(N, C, H, W, float, CPU); \
BM_BIAS_ADD_MACRO(N, C, H, W, float, OPENCL); \
BM_BIAS_ADD_MACRO(N, C, H, W, half, OPENCL);
BM_BIAS_ADD
(
1
,
1
,
512
,
512
,
float
);
BM_BIAS_ADD
(
1
,
3
,
128
,
128
,
float
);
BM_BIAS_ADD
(
1
,
3
,
512
,
512
,
float
);
BM_BIAS_ADD
(
1
,
32
,
112
,
112
,
float
);
BM_BIAS_ADD
(
1
,
64
,
256
,
256
,
float
);
BM_BIAS_ADD
(
1
,
64
,
512
,
512
,
float
);
BM_BIAS_ADD
(
1
,
128
,
56
,
56
,
float
);
BM_BIAS_ADD
(
1
,
128
,
256
,
256
,
float
);
BM_BIAS_ADD
(
1
,
256
,
14
,
14
,
float
);
BM_BIAS_ADD
(
1
,
512
,
14
,
14
,
float
);
BM_BIAS_ADD
(
1
,
1024
,
7
,
7
,
float
);
BM_BIAS_ADD
(
32
,
1
,
256
,
256
,
float
);
BM_BIAS_ADD
(
32
,
3
,
256
,
256
,
float
);
}
//
namespace mace
BM_BIAS_ADD
(
1
,
1
,
512
,
512
);
BM_BIAS_ADD
(
1
,
3
,
128
,
128
);
BM_BIAS_ADD
(
1
,
3
,
512
,
512
);
BM_BIAS_ADD
(
1
,
32
,
112
,
112
);
BM_BIAS_ADD
(
1
,
64
,
256
,
256
);
BM_BIAS_ADD
(
1
,
64
,
512
,
512
);
BM_BIAS_ADD
(
1
,
128
,
56
,
56
);
BM_BIAS_ADD
(
1
,
128
,
256
,
256
);
BM_BIAS_ADD
(
1
,
256
,
14
,
14
);
BM_BIAS_ADD
(
1
,
512
,
14
,
14
);
BM_BIAS_ADD
(
1
,
1024
,
7
,
7
);
BM_BIAS_ADD
(
32
,
1
,
256
,
256
);
BM_BIAS_ADD
(
32
,
3
,
256
,
256
);
}
// namespace mace
mace/ops/buffer_to_image.cc
浏览文件 @
780f5a60
...
...
@@ -20,4 +20,4 @@ void Register_BufferToImage(OperatorRegistry *op_registry) {
BufferToImageOp
<
DeviceType
::
OPENCL
,
half
>
);
}
}
//
namespace mace
}
// namespace mace
mace/ops/buffer_to_image.h
浏览文件 @
780f5a60
...
...
@@ -35,5 +35,5 @@ class BufferToImageOp: public Operator<D, T> {
OP_OUTPUT_TAGS
(
OUTPUT
);
};
}
//
namespace mace
#endif
//
MACE_OPS_BUFFER_TO_IMAGE_H_
}
//
namespace mace
#endif
//
MACE_OPS_BUFFER_TO_IMAGE_H_
mace/ops/conv_2d.cc
浏览文件 @
780f5a60
...
...
@@ -13,20 +13,6 @@ void Register_Conv2D(OperatorRegistry *op_registry) {
.
Build
(),
Conv2dOp
<
DeviceType
::
CPU
,
float
>
);
REGISTER_OPERATOR
(
op_registry
,
OpKeyBuilder
(
"Conv2D"
)
.
Device
(
DeviceType
::
CPU
)
.
TypeConstraint
<
half
>
(
"T"
)
.
Build
(),
Conv2dOp
<
DeviceType
::
CPU
,
half
>
);
#if MACE_ENABLE_NEON
REGISTER_OPERATOR
(
op_registry
,
OpKeyBuilder
(
"Conv2D"
)
.
Device
(
DeviceType
::
NEON
)
.
TypeConstraint
<
float
>
(
"T"
)
.
Build
(),
Conv2dOp
<
DeviceType
::
NEON
,
float
>
);
#endif // MACE_ENABLE_NEON
REGISTER_OPERATOR
(
op_registry
,
OpKeyBuilder
(
"Conv2D"
)
.
Device
(
DeviceType
::
OPENCL
)
.
TypeConstraint
<
float
>
(
"T"
)
...
...
mace/ops/conv_2d_benchmark.cc
浏览文件 @
780f5a60
...
...
@@ -29,7 +29,7 @@ static void Conv2d(int iters,
// Add input data
net
.
AddRandomInput
<
D
,
float
>
(
"Input"
,
{
batch
,
height
,
width
,
channels
});
net
.
AddRandomInput
<
D
,
float
>
(
"Filter"
,
{
kernel_h
,
kernel_w
,
channels
,
output_
channels
});
{
kernel_h
,
kernel_w
,
output_channels
,
channels
});
net
.
AddRandomInput
<
D
,
float
>
(
"Bias"
,
{
output_channels
});
if
(
D
==
DeviceType
::
OPENCL
)
{
...
...
@@ -92,50 +92,46 @@ static void Conv2d(int iters,
BENCHMARK( \
BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE)
#define BM_CONV_2D(N, C, H, W, KH, KW, S, P, OC, TYPE) \
BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, OPENCL);
// ICNet
BM_CONV_2D
(
1
,
512
,
15
,
15
,
1
,
1
,
1
,
VALID
,
1024
,
half
);
//// SNPE GPU ExecutionDuration = 448us, % ALU Utilization = 105
BM_CONV_2D
(
1
,
64
,
60
,
60
,
1
,
1
,
1
,
VALID
,
128
,
half
);
//// SNPE GPU ExecutionDuration = 258us, % ALU Utilization = 108
BM_CONV_2D
(
1
,
32
,
60
,
60
,
1
,
1
,
1
,
VALID
,
128
,
half
);
BM_CONV_2D
(
1
,
128
,
60
,
60
,
3
,
3
,
1
,
VALID
,
128
,
half
);
//// SNPE GPU ExecutionDuration = 506us, % ALU Utilization = 106.8
BM_CONV_2D
(
1
,
32
,
60
,
60
,
3
,
3
,
1
,
SAME
,
32
,
half
);
BM_CONV_2D
(
1
,
3
,
512
,
512
,
7
,
7
,
2
,
SAME
,
64
,
half
);
BM_CONV_2D
(
1
,
512
,
64
,
64
,
1
,
1
,
1
,
SAME
,
256
,
half
);
BM_CONV_2D
(
1
,
128
,
16
,
16
,
3
,
3
,
1
,
VALID
,
32
,
half
);
BM_CONV_2D
(
1
,
128
,
64
,
64
,
3
,
3
,
1
,
VALID
,
32
,
half
);
BM_CONV_2D
(
1
,
128
,
128
,
128
,
3
,
3
,
1
,
VALID
,
32
,
half
);
// Test RGB <-> YUV
// BM_CONV_2D(1, 3, 2160, 1080, 1, 1, 1, VALID, 3, float);
// BM_CONV_2D(1, 3, 480, 480, 1, 1, 1, VALID, 3, float);
//
// BM_CONV_2D(1, 64, 32, 32, 1, 1, 1, VALID, 128, float);
// BM_CONV_2D(1, 64, 33, 31, 1, 1, 1, VALID, 128, float); // Test bad
// alignments
// BM_CONV_2D(1, 3, 512, 512, 1, 1, 1, VALID, 3, float);
// BM_CONV_2D(1, 32, 112, 112, 1, 1, 1, VALID, 64, float);
// BM_CONV_2D(1, 64, 56, 56, 1, 1, 1, VALID, 128, float);
// BM_CONV_2D(1, 256, 28, 28, 1, 1, 1, VALID, 256, float);
// BM_CONV_2D(1, 1024, 7, 7, 1, 1, 1, VALID, 1024, float);
// BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, VALID, 128, float);
// BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 128, float);
// BM_CONV_2D(1, 3, 512, 512, 3, 3, 1, VALID, 3, float);
// BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, SAME, 128, float);
// BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, SAME, 128, float);
// BM_CONV_2D(1, 64, 32, 32, 3, 3, 2, VALID, 128, float);
// BM_CONV_2D(1, 3, 512, 512, 3, 3, 2, VALID, 3, float);
// BM_CONV_2D(1, 64, 33, 31, 3, 3, 2, VALID, 128, float);
// BM_CONV_2D(1, 64, 32, 32, 3, 3, 2, SAME, 128, float);
// BM_CONV_2D(1, 64, 33, 31, 3, 3, 2, SAME, 128, float);
// BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, VALID, 128, float);
// BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, VALID, 128, float);
// BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, SAME, 128, float);
// BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, SAME, 128, float);
}
// namespace mace
#define BM_CONV_2D(N, C, H, W, KH, KW, S, P, OC) \
BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, float, CPU); \
BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, float, OPENCL); \
BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, half, OPENCL);
BM_CONV_2D
(
1
,
512
,
15
,
15
,
1
,
1
,
1
,
VALID
,
1024
);
BM_CONV_2D
(
1
,
64
,
60
,
60
,
1
,
1
,
1
,
VALID
,
128
);
BM_CONV_2D
(
1
,
32
,
60
,
60
,
1
,
1
,
1
,
VALID
,
128
);
BM_CONV_2D
(
1
,
128
,
60
,
60
,
3
,
3
,
1
,
VALID
,
128
);
BM_CONV_2D
(
1
,
32
,
60
,
60
,
3
,
3
,
1
,
SAME
,
32
);
BM_CONV_2D
(
1
,
3
,
512
,
512
,
7
,
7
,
2
,
SAME
,
64
);
BM_CONV_2D
(
1
,
512
,
64
,
64
,
1
,
1
,
1
,
SAME
,
256
);
BM_CONV_2D
(
1
,
128
,
16
,
16
,
3
,
3
,
1
,
VALID
,
32
);
BM_CONV_2D
(
1
,
128
,
64
,
64
,
3
,
3
,
1
,
VALID
,
32
);
BM_CONV_2D
(
1
,
128
,
128
,
128
,
3
,
3
,
1
,
VALID
,
32
);
BM_CONV_2D
(
1
,
3
,
480
,
480
,
1
,
1
,
1
,
VALID
,
3
);
BM_CONV_2D
(
1
,
64
,
32
,
32
,
1
,
1
,
1
,
VALID
,
128
);
BM_CONV_2D
(
1
,
64
,
33
,
31
,
1
,
1
,
1
,
VALID
,
128
);
// Test bad alignments
BM_CONV_2D
(
1
,
3
,
512
,
512
,
1
,
1
,
1
,
VALID
,
3
);
BM_CONV_2D
(
1
,
32
,
112
,
112
,
1
,
1
,
1
,
VALID
,
64
);
BM_CONV_2D
(
1
,
64
,
56
,
56
,
1
,
1
,
1
,
VALID
,
128
);
BM_CONV_2D
(
1
,
256
,
28
,
28
,
1
,
1
,
1
,
VALID
,
256
);
BM_CONV_2D
(
1
,
1024
,
7
,
7
,
1
,
1
,
1
,
VALID
,
1024
);
BM_CONV_2D
(
1
,
64
,
32
,
32
,
3
,
3
,
1
,
VALID
,
128
);
BM_CONV_2D
(
1
,
64
,
33
,
31
,
3
,
3
,
1
,
VALID
,
128
);
BM_CONV_2D
(
1
,
3
,
512
,
512
,
3
,
3
,
1
,
VALID
,
3
);
BM_CONV_2D
(
1
,
64
,
32
,
32
,
3
,
3
,
1
,
SAME
,
128
);
BM_CONV_2D
(
1
,
64
,
33
,
31
,
3
,
3
,
1
,
SAME
,
128
);
BM_CONV_2D
(
1
,
64
,
32
,
32
,
3
,
3
,
2
,
VALID
,
128
);
BM_CONV_2D
(
1
,
3
,
512
,
512
,
3
,
3
,
2
,
VALID
,
3
);
BM_CONV_2D
(
1
,
64
,
33
,
31
,
3
,
3
,
2
,
VALID
,
128
);
BM_CONV_2D
(
1
,
64
,
32
,
32
,
3
,
3
,
2
,
SAME
,
128
);
BM_CONV_2D
(
1
,
64
,
33
,
31
,
3
,
3
,
2
,
SAME
,
128
);
BM_CONV_2D
(
1
,
64
,
32
,
32
,
5
,
5
,
1
,
VALID
,
128
);
BM_CONV_2D
(
1
,
64
,
32
,
31
,
5
,
5
,
1
,
VALID
,
128
);
BM_CONV_2D
(
1
,
64
,
32
,
32
,
5
,
5
,
1
,
SAME
,
128
);
BM_CONV_2D
(
1
,
64
,
32
,
31
,
5
,
5
,
1
,
SAME
,
128
);
}
// namespace mace
mace/ops/conv_2d_test.cc
浏览文件 @
780f5a60
...
...
@@ -10,81 +10,6 @@ using namespace mace;
class
Conv2dOpTest
:
public
OpsTestBase
{};
template
<
DeviceType
D
>
void
TestSimple3x3VALID
()
{
OpsTestNet
net
;
OpDefBuilder
(
"Conv2D"
,
"Conv2dTest"
)
.
Input
(
"Input"
)
.
Input
(
"Filter"
)
.
Input
(
"Bias"
)
.
Output
(
"Output"
)
.
AddIntsArg
(
"strides"
,
{
1
,
1
})
.
AddIntArg
(
"padding"
,
Padding
::
VALID
)
.
AddIntsArg
(
"dilations"
,
{
1
,
1
})
.
Finalize
(
net
.
NewOperatorDef
());
// Add args
// Add input data
net
.
AddInputFromArray
<
D
,
float
>
(
"Input"
,
{
1
,
2
,
3
,
3
},
{
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
});
net
.
AddInputFromArray
<
D
,
float
>
(
"Filter"
,
{
1
,
2
,
3
,
3
},
{
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
});
net
.
AddInputFromArray
<
D
,
float
>
(
"Bias"
,
{
1
},
{
0.1
f
});
// Run
net
.
RunOp
(
D
);
// Check
auto
expected
=
CreateTensor
<
float
>
({
1
,
1
,
1
,
1
},
{
18.1
f
});
ExpectTensorNear
<
float
>
(
*
expected
,
*
net
.
GetOutput
(
"Output"
),
0.001
);
}
template
<
DeviceType
D
>
void
TestSimple3x3SAME
()
{
OpsTestNet
net
;
OpDefBuilder
(
"Conv2D"
,
"Conv2dTest"
)
.
Input
(
"Input"
)
.
Input
(
"Filter"
)
.
Input
(
"Bias"
)
.
Output
(
"Output"
)
.
AddIntsArg
(
"strides"
,
{
1
,
1
})
.
AddIntArg
(
"padding"
,
Padding
::
SAME
)
.
AddIntsArg
(
"dilations"
,
{
1
,
1
})
.
Finalize
(
net
.
NewOperatorDef
());
// Add input data
net
.
AddInputFromArray
<
D
,
float
>
(
"Input"
,
{
1
,
2
,
3
,
3
},
{
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
});
net
.
AddInputFromArray
<
D
,
float
>
(
"Filter"
,
{
1
,
2
,
3
,
3
},
{
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
});
net
.
AddInputFromArray
<
D
,
float
>
(
"Bias"
,
{
1
},
{
0.1
f
});
// Run
net
.
RunOp
(
D
);
// Check
auto
expected
=
CreateTensor
<
float
>
(
{
1
,
1
,
3
,
3
},
{
8.1
f
,
12.1
f
,
8.1
f
,
12.1
f
,
18.1
f
,
12.1
f
,
8.1
f
,
12.1
f
,
8.1
f
});
ExpectTensorNear
<
float
>
(
*
expected
,
*
net
.
GetOutput
(
"Output"
),
0.001
);
}
#if __ARM_NEON
// Compiled only when __ARM_NEON evaluates to non-zero: runs the VALID and
// SAME 3x3 convolution checks with DeviceType::NEON.
TEST_F
(
Conv2dOpTest
,
NEONSimple
)
{
TestSimple3x3VALID
<
DeviceType
::
NEON
>
();
TestSimple3x3SAME
<
DeviceType
::
NEON
>
();
}
#endif
template
<
DeviceType
D
,
typename
T
>
void
TestNHWCSimple3x3VALID
()
{
OpsTestNet
net
;
...
...
@@ -93,7 +18,7 @@ void TestNHWCSimple3x3VALID() {
"Input"
,
{
1
,
3
,
3
,
2
},
{
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
});
net
.
AddInputFromArray
<
D
,
T
>
(
"Filter"
,
{
3
,
3
,
2
,
1
},
"Filter"
,
{
3
,
3
,
1
,
2
},
{
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
});
net
.
AddInputFromArray
<
D
,
T
>
(
"Bias"
,
{
1
},
{
0.1
f
});
...
...
@@ -150,7 +75,7 @@ void TestNHWCSimple3x3SAME() {
"Input"
,
{
1
,
3
,
3
,
2
},
{
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
});
net
.
AddInputFromArray
<
D
,
T
>
(
"Filter"
,
{
3
,
3
,
2
,
1
},
"Filter"
,
{
3
,
3
,
1
,
2
},
{
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
});
net
.
AddInputFromArray
<
D
,
T
>
(
"Bias"
,
{
1
},
{
0.1
f
});
...
...
@@ -211,42 +136,6 @@ TEST_F(Conv2dOpTest, OPENCLSimple) {
TestNHWCSimple3x3SAME
<
DeviceType
::
OPENCL
,
float
>
();
}
template
<
DeviceType
D
>
void
TestSimple3x3WithoutBias
()
{
OpsTestNet
net
;
OpDefBuilder
(
"Conv2D"
,
"Conv2dTest"
)
.
Input
(
"Input"
)
.
Input
(
"Filter"
)
.
Output
(
"Output"
)
.
AddIntsArg
(
"strides"
,
{
1
,
1
})
.
AddIntArg
(
"padding"
,
Padding
::
VALID
)
.
AddIntsArg
(
"dilations"
,
{
1
,
1
})
.
Finalize
(
net
.
NewOperatorDef
());
// Add input data
net
.
AddInputFromArray
<
D
,
float
>
(
"Input"
,
{
1
,
2
,
3
,
3
},
{
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
});
net
.
AddInputFromArray
<
D
,
float
>
(
"Filter"
,
{
1
,
2
,
3
,
3
},
{
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
});
// Run
net
.
RunOp
(
D
);
// Check
auto
expected
=
CreateTensor
<
float
>
({
1
,
1
,
1
,
1
},
{
18.0
f
});
ExpectTensorNear
<
float
>
(
*
expected
,
*
net
.
GetOutput
(
"Output"
),
0.001
);
}
#ifdef __ARM_NEON
// Compiled only when __ARM_NEON is defined: runs the bias-less 3x3
// convolution check with DeviceType::NEON.
TEST_F
(
Conv2dOpTest
,
NEONWithouBias
)
{
TestSimple3x3WithoutBias
<
DeviceType
::
NEON
>
();
}
#endif
template
<
DeviceType
D
,
typename
T
>
void
TestNHWCSimple3x3WithoutBias
()
{
OpsTestNet
net
;
...
...
@@ -256,7 +145,7 @@ void TestNHWCSimple3x3WithoutBias() {
"Input"
,
{
1
,
3
,
3
,
2
},
{
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
});
net
.
AddInputFromArray
<
D
,
T
>
(
"Filter"
,
{
3
,
3
,
2
,
1
},
"Filter"
,
{
3
,
3
,
1
,
2
},
{
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
});
...
...
@@ -309,47 +198,6 @@ TEST_F(Conv2dOpTest, OPENCLWithoutBias) {
TestNHWCSimple3x3WithoutBias
<
DeviceType
::
OPENCL
,
float
>
();
}
// Checks a Conv2D op with a 3x3 kernel, stride 2, SAME padding and a bias
// input on device D.
//
// The input is a {1, 2, 5, 5} tensor of ones and the filter is {2, 2, 3, 3};
// the first half of the filter values are 1.0 and the second half 0.5, so the
// two halves of the {1, 2, 3, 3} output differ by a factor of two before the
// bias {0.1, 0.2} is added.
// NOTE(review): shapes look like NCHW input / OIHW filter -- confirm against
// the kernel under test.
template <DeviceType D>
static void TestCombined3x3() {
  OpsTestNet net;

  // Build the single-operator graph.
  OpDefBuilder("Conv2D", "Conv2DTest")
      .Input("Input")
      .Input("Filter")
      .Input("Bias")
      .Output("Output")
      .AddIntsArg("strides", {2, 2})
      .AddIntArg("padding", Padding::SAME)
      .AddIntsArg("dilations", {1, 1})
      .Finalize(net.NewOperatorDef());

  // Input: 50 ones (1 * 2 * 5 * 5).
  const std::vector<float> input_data(1 * 2 * 5 * 5, 1.0f);
  net.AddInputFromArray<D, float>("Input", {1, 2, 5, 5}, input_data);

  // Filter: 18 values of 1.0 followed by 18 values of 0.5.
  const std::vector<float> filter_data = {
      1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
      1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
      0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f,
      0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f};
  net.AddInputFromArray<D, float>("Filter", {2, 2, 3, 3}, filter_data);

  const std::vector<float> bias_data = {0.1f, 0.2f};
  net.AddInputFromArray<D, float>("Bias", {2}, bias_data);

  // Execute the op on the requested device.
  net.RunOp(D);

  // Compare against the hand-computed reference output.
  auto expected_output = CreateTensor<float>(
      {1, 2, 3, 3},
      {8.1f,  12.1f, 8.1f,
       12.1f, 18.1f, 12.1f,
       8.1f,  12.1f, 8.1f,
       4.2f,  6.2f,  4.2f,
       6.2f,  9.2f,  6.2f,
       4.2f,  6.2f,  4.2f});
  ExpectTensorNear<float>(*expected_output, *net.GetOutput("Output"), 0.001);
}
#ifdef __ARM_NEON
// Runs the combined 3x3 convolution check (stride 2, SAME padding, with bias)
// on the NEON device. Only compiled when __ARM_NEON is defined.
TEST_F(Conv2dOpTest, NEONCombined) {
  TestCombined3x3<DeviceType::NEON>();
}
#endif
template
<
DeviceType
D
,
typename
T
>
static
void
TestNHWCCombined3x3
()
{
// Construct graph
...
...
@@ -362,9 +210,9 @@ static void TestNHWCCombined3x3() {
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
});
net
.
AddInputFromArray
<
D
,
T
>
(
"Filter"
,
{
3
,
3
,
2
,
2
},
{
1.0
f
,
0.5
f
,
1.0
f
,
0.5
f
,
1.0
f
,
0.5
f
,
1.0
f
,
0.5
f
,
1.0
f
,
0.5
f
,
1.0
f
,
0.5
f
,
1.0
f
,
0.5
f
,
1.0
f
,
0.5
f
,
1.0
f
,
0.5
f
,
1.0
f
,
0.5
f
,
1.0
f
,
0.5
f
,
1.0
f
,
0.5
f
,
1.0
f
,
0.5
f
,
1.0
f
,
0.5
f
,
1.0
f
,
0.5
f
,
1.0
f
,
0.5
f
,
1.0
f
,
0.5
f
,
1.0
f
,
0.5
f
});
{
1.0
f
,
1.0
f
,
0.5
f
,
0.5
f
,
1.0
f
,
1.0
f
,
0.5
f
,
0.5
f
,
1.0
f
,
1.0
f
,
0.5
f
,
0.5
f
,
1.0
f
,
1.0
f
,
0.5
f
,
0.5
f
,
1.0
f
,
1.0
f
,
0.5
f
,
0.5
f
,
1.0
f
,
1.0
f
,
0.5
f
,
0.5
f
,
1.0
f
,
1.0
f
,
0.5
f
,
0.5
f
,
1.0
f
,
1.0
f
,
0.5
f
,
0.5
f
,
1.0
f
,
1.0
f
,
0.5
f
,
0.5
f
});
net
.
AddInputFromArray
<
D
,
T
>
(
"Bias"
,
{
2
},
{
0.1
f
,
0.2
f
});
if
(
D
==
DeviceType
::
OPENCL
)
{
...
...
@@ -436,8 +284,8 @@ void TestConv1x1() {
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
});
net
.
AddInputFromArray
<
D
,
float
>
(
"Filter"
,
{
1
,
1
,
5
,
2
},
{
1.0
f
,
2.0
f
,
1.0
f
,
2.0
f
,
1.0
f
,
2.0
f
,
1.0
f
,
2.0
f
,
1
.0
f
,
2.0
f
});
"Filter"
,
{
1
,
1
,
2
,
5
},
{
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
2.0
f
,
2.0
f
,
2.0
f
,
2
.0
f
,
2.0
f
});
net
.
AddInputFromArray
<
D
,
float
>
(
"Bias"
,
{
2
},
{
0.1
f
,
0.2
f
});
if
(
D
==
DeviceType
::
OPENCL
)
{
...
...
@@ -522,7 +370,7 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
// Add input data
net
.
AddRandomInput
<
D
,
T
>
(
"Input"
,
{
batch
,
height
,
width
,
input_channels
});
net
.
AddRandomInput
<
D
,
T
>
(
"Filter"
,
{
kernel_h
,
kernel_w
,
input_channels
,
out
put_channels
});
"Filter"
,
{
kernel_h
,
kernel_w
,
output_channels
,
in
put_channels
});
net
.
AddRandomInput
<
D
,
T
>
(
"Bias"
,
{
output_channels
});
// run on cpu
...
...
@@ -606,7 +454,7 @@ static void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
float_input_data
);
std
::
vector
<
float
>
float_filter_data
;
GenerateRandomRealTypeData
(
{
kernel_h
,
kernel_w
,
input_channels
,
out
put_channels
},
{
kernel_h
,
kernel_w
,
output_channels
,
in
put_channels
},
float_filter_data
);
std
::
vector
<
float
>
float_bias_data
;
GenerateRandomRealTypeData
({
output_channels
},
float_bias_data
);
...
...
@@ -614,7 +462,7 @@ static void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
net
.
AddInputFromArray
<
D
,
float
>
(
"Input"
,
{
batch
,
height
,
width
,
input_channels
},
float_input_data
);
net
.
AddInputFromArray
<
D
,
float
>
(
"Filter"
,
{
kernel_h
,
kernel_w
,
input_channels
,
out
put_channels
},
"Filter"
,
{
kernel_h
,
kernel_w
,
output_channels
,
in
put_channels
},
float_filter_data
);
net
.
AddInputFromArray
<
D
,
float
>
(
"Bias"
,
{
output_channels
},
float_bias_data
);
...
...
@@ -748,7 +596,7 @@ static void TestDilationConvNxN(const std::vector<index_t> &shape, const int dil
// Add input data
net
.
AddRandomInput
<
D
,
T
>
(
"Input"
,
{
batch
,
height
,
width
,
input_channels
});
net
.
AddRandomInput
<
D
,
T
>
(
"Filter"
,
{
kernel_h
,
kernel_w
,
input_channels
,
out
put_channels
});
"Filter"
,
{
kernel_h
,
kernel_w
,
output_channels
,
in
put_channels
});
net
.
AddRandomInput
<
D
,
T
>
(
"Bias"
,
{
output_channels
});
// run on cpu
...
...
mace/ops/depthwise_conv2d.cc
浏览文件 @
780f5a60
...
...
@@ -13,14 +13,6 @@ void Register_DepthwiseConv2d(OperatorRegistry *op_registry) {
.
Build
(),
DepthwiseConv2dOp
<
DeviceType
::
CPU
,
float
>
);
#if MACE_ENABLE_NEON
REGISTER_OPERATOR
(
op_registry
,
OpKeyBuilder
(
"DepthwiseConv2d"
)
.
Device
(
DeviceType
::
NEON
)
.
TypeConstraint
<
float
>
(
"T"
)
.
Build
(),
DepthwiseConv2dOp
<
DeviceType
::
NEON
,
float
>
);
#endif // MACE_ENABLE_NEON
REGISTER_OPERATOR
(
op_registry
,
OpKeyBuilder
(
"DepthwiseConv2d"
)
.
Device
(
DeviceType
::
OPENCL
)
.
TypeConstraint
<
float
>
(
"T"
)
...
...
mace/ops/depthwise_conv2d_test.cc
浏览文件 @
780f5a60
...
...
@@ -288,12 +288,6 @@ void TestNxNS12(const index_t height, const index_t width) {
}
}
#if __ARM_NEON
// Small-input case: runs TestNxNS12 on NEON with height = 4, width = 4.
TEST_F(DepthwiseConv2dOpTest, NeonSimpleNxNS12) {
  TestNxNS12<DeviceType::NEON, float>(4, 4);
}
#endif
// Small-input case: runs TestNxNS12 on OpenCL with float data,
// height = 4, width = 4.
TEST_F(DepthwiseConv2dOpTest, OpenCLSimpleNxNS12) {
  TestNxNS12<DeviceType::OPENCL, float>(4, 4);
}
...
...
@@ -302,13 +296,6 @@ TEST_F(DepthwiseConv2dOpTest, OpenCLSimpleNxNS12Half) {
TestNxNS12
<
DeviceType
::
OPENCL
,
half
>
(
4
,
4
);
}
#if __ARM_NEON
// Runs TestNxNS12 on NEON for two square (height, width) sizes: 64x64 and
// 128x128.
TEST_F(DepthwiseConv2dOpTest, NeonAlignedNxNS12) {
  TestNxNS12<DeviceType::NEON, float>(64, 64);
  TestNxNS12<DeviceType::NEON, float>(128, 128);
}
#endif
TEST_F
(
DepthwiseConv2dOpTest
,
OpenCLAlignedNxNS12
)
{
TestNxNS12
<
DeviceType
::
OPENCL
,
float
>
(
64
,
64
);
TestNxNS12
<
DeviceType
::
OPENCL
,
float
>
(
128
,
128
);
...
...
@@ -319,12 +306,6 @@ TEST_F(DepthwiseConv2dOpTest, OpenCLAlignedNxNS12Half) {
TestNxNS12
<
DeviceType
::
OPENCL
,
half
>
(
128
,
128
);
}
#if __ARM_NEON
// Runs TestNxNS12 on NEON with an odd, non-square size
// (height = 107, width = 113).
TEST_F(DepthwiseConv2dOpTest, NeonUnalignedNxNS12) {
  TestNxNS12<DeviceType::NEON, float>(107, 113);
}
#endif
// Runs TestNxNS12 on OpenCL with float data and an odd, non-square size
// (height = 107, width = 113).
TEST_F(DepthwiseConv2dOpTest, OpenCLUnalignedNxNS12) {
  TestNxNS12<DeviceType::OPENCL, float>(107, 113);
}
...
...
mace/ops/depthwise_conv_2d_benchmark.cc
浏览文件 @
780f5a60
...
...
@@ -89,21 +89,22 @@ static void DepthwiseConv2d(int iters,
BENCHMARK( \
BM_DEPTHWISE_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE)
#define BM_DEPTHWISE_CONV_2D(N, C, H, W, KH, KW, S, P, OC, TYPE) \
BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, CPU); \
BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, OPENCL);
#define BM_DEPTHWISE_CONV_2D(N, C, H, W, KH, KW, S, P, OC) \
BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, float, CPU); \
BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, float, OPENCL); \
BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, half, OPENCL);
BM_DEPTHWISE_CONV_2D
(
1
,
64
,
32
,
32
,
3
,
3
,
1
,
VALID
,
1
,
float
);
//BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 1
, float
);
BM_DEPTHWISE_CONV_2D
(
1
,
64
,
32
,
32
,
3
,
3
,
1
,
SAME
,
1
,
float
);
//BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 1, SAME, 1
, float
);
//BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 1, VALID, 1
, float
);
//BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 1, SAME, 1
, float
);
//BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 2, VALID, 1
, float
);
//BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 2, VALID, 1
, float
);
//BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 2, SAME, 1
, float
);
//BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 2, SAME, 1
, float
);
//BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 2, VALID, 1
, float
);
//BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 2, SAME, 1
, float
);
BM_DEPTHWISE_CONV_2D
(
1
,
64
,
32
,
32
,
3
,
3
,
1
,
VALID
,
1
);
//BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 1);
BM_DEPTHWISE_CONV_2D
(
1
,
64
,
32
,
32
,
3
,
3
,
1
,
SAME
,
1
);
//BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 1, SAME, 1);
//BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 1, VALID, 1);
//BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 1, SAME, 1);
//BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 2, VALID, 1);
//BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 2, VALID, 1);
//BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 2, SAME, 1);
//BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 2, SAME, 1);
//BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 2, VALID, 1);
//BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 2, SAME, 1);
}
// namespace mace
mace/ops/folded_batch_norm.cc
浏览文件 @
780f5a60
...
...
@@ -14,14 +14,6 @@ void Register_FoldedBatchNorm(OperatorRegistry *op_registry) {
.
Build
(),
FoldedBatchNormOp
<
DeviceType
::
CPU
,
float
>
);
#if MACE_ENABLE_NEON
REGISTER_OPERATOR
(
op_registry
,
OpKeyBuilder
(
"FoldedBatchNorm"
)
.
Device
(
DeviceType
::
NEON
)
.
TypeConstraint
<
float
>
(
"T"
)
.
Build
(),
FoldedBatchNormOp
<
DeviceType
::
NEON
,
float
>
);
#endif // MACE_ENABLE_NEON
REGISTER_OPERATOR
(
op_registry
,
OpKeyBuilder
(
"FoldedBatchNorm"
)
.
Device
(
DeviceType
::
OPENCL
)
...
...
mace/ops/fused_conv_2d.cc
浏览文件 @
780f5a60
...
...
@@ -13,12 +13,6 @@ void Register_FusedConv2D(OperatorRegistry *op_registry) {
.
Build
(),
FusedConv2dOp
<
DeviceType
::
CPU
,
float
>
);
REGISTER_OPERATOR
(
op_registry
,
OpKeyBuilder
(
"FusedConv2D"
)
.
Device
(
DeviceType
::
CPU
)
.
TypeConstraint
<
half
>
(
"T"
)
.
Build
(),
FusedConv2dOp
<
DeviceType
::
CPU
,
half
>
);
REGISTER_OPERATOR
(
op_registry
,
OpKeyBuilder
(
"FusedConv2D"
)
.
Device
(
DeviceType
::
OPENCL
)
.
TypeConstraint
<
float
>
(
"T"
)
...
...
mace/ops/fused_conv_2d_test.cc
浏览文件 @
780f5a60
...
...
@@ -298,7 +298,7 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
// Add input data
net
.
AddRandomInput
<
D
,
T
>
(
"Input"
,
{
batch
,
height
,
width
,
input_channels
});
net
.
AddRandomInput
<
D
,
T
>
(
"Filter"
,
{
kernel_h
,
kernel_w
,
input_channels
,
out
put_channels
});
"Filter"
,
{
kernel_h
,
kernel_w
,
output_channels
,
in
put_channels
});
net
.
AddRandomInput
<
D
,
T
>
(
"Bias"
,
{
output_channels
});
// run on cpu
...
...
@@ -375,7 +375,7 @@ static void TestHalfComplexConvNxNS12(const std::vector<index_t> &shape) {
float_input_data
);
std
::
vector
<
float
>
float_filter_data
;
GenerateRandomRealTypeData
(
{
kernel_h
,
kernel_w
,
input_channels
,
out
put_channels
},
{
kernel_h
,
kernel_w
,
output_channels
,
in
put_channels
},
float_filter_data
);
std
::
vector
<
float
>
float_bias_data
;
GenerateRandomRealTypeData
({
output_channels
},
float_bias_data
);
...
...
@@ -383,7 +383,7 @@ static void TestHalfComplexConvNxNS12(const std::vector<index_t> &shape) {
net
.
AddInputFromArray
<
D
,
float
>
(
"Input"
,
{
batch
,
height
,
width
,
input_channels
},
float_input_data
);
net
.
AddInputFromArray
<
D
,
float
>
(
"Filter"
,
{
kernel_h
,
kernel_w
,
input_channels
,
out
put_channels
},
"Filter"
,
{
kernel_h
,
kernel_w
,
output_channels
,
in
put_channels
},
float_filter_data
);
net
.
AddInputFromArray
<
D
,
float
>
(
"Bias"
,
{
output_channels
},
float_bias_data
);
...
...
@@ -462,7 +462,7 @@ static void TestGeneralConvNxNS12(const std::vector<index_t> &image_shape,
// Add input data
net
.
AddRandomInput
<
D
,
T
>
(
"Input"
,
{
batch
,
height
,
width
,
input_channels
});
net
.
AddRandomInput
<
D
,
T
>
(
"Filter"
,
{
kernel_h
,
kernel_w
,
input_channels
,
out
put_channels
});
"Filter"
,
{
kernel_h
,
kernel_w
,
output_channels
,
in
put_channels
});
net
.
AddRandomInput
<
D
,
T
>
(
"Bias"
,
{
output_channels
});
// run on cpu
...
...
@@ -540,7 +540,7 @@ static void TestAtrousConvNxN(const std::vector<index_t> &shape, const int dilat
// Add input data
net
.
AddRandomInput
<
D
,
T
>
(
"Input"
,
{
batch
,
height
,
width
,
input_channels
});
net
.
AddRandomInput
<
D
,
T
>
(
"Filter"
,
{
kernel_h
,
kernel_w
,
input_channels
,
out
put_channels
});
"Filter"
,
{
kernel_h
,
kernel_w
,
output_channels
,
in
put_channels
});
net
.
AddRandomInput
<
D
,
T
>
(
"Bias"
,
{
output_channels
});
// run on cpu
...
...
@@ -622,7 +622,7 @@ static void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
// Add input data
net
.
AddRandomInput
<
D
,
float
>
(
"Input"
,
{
batch
,
height
,
width
,
input_channels
});
net
.
AddRandomInput
<
D
,
float
>
(
"Filter"
,
{
kernel_h
,
kernel_w
,
input_channels
,
out
put_channels
});
"Filter"
,
{
kernel_h
,
kernel_w
,
output_channels
,
in
put_channels
});
net
.
AddRandomInput
<
D
,
float
>
(
"Bias"
,
{
output_channels
});
// run on cpu
...
...
mace/ops/global_avg_pooling.cc
浏览文件 @
780f5a60
...
...
@@ -12,14 +12,6 @@ void Register_GlobalAvgPooling(OperatorRegistry *op_registry) {
.
TypeConstraint
<
float
>
(
"T"
)
.
Build
(),
GlobalAvgPoolingOp
<
DeviceType
::
CPU
,
float
>
);
#if MACE_ENABLE_NEON
REGISTER_OPERATOR
(
op_registry
,
OpKeyBuilder
(
"GlobalAvgPooling"
)
.
Device
(
DeviceType
::
NEON
)
.
TypeConstraint
<
float
>
(
"T"
)
.
Build
(),
GlobalAvgPoolingOp
<
DeviceType
::
NEON
,
float
>
);
#endif // MACE_ENABLE_NEON
}
}
// namespace mace
mace/ops/global_avg_pooling_test.cc
浏览文件 @
780f5a60
...
...
@@ -31,29 +31,3 @@ TEST_F(GlobalAvgPoolingOpTest, 3x7x7_CPU) {
ExpectTensorNear
<
float
>
(
*
expected
,
*
net
.
GetOutput
(
"Output"
),
0.001
);
}
#if __ARM_NEON
// Global average pooling over a {1, 3, 7, 7} input executed on NEON.
// Each run of 49 consecutive input values holds a constant (1, 2, then 3),
// and the expected {1, 3, 1, 1} output is {1, 2, 3}.
TEST_F(GlobalAvgPoolingOpTest, 3x7x7_NEON) {
  OpsTestNet net;

  // Single-op graph: Input -> GlobalAvgPooling -> Output.
  OpDefBuilder("GlobalAvgPooling", "GlobalAvgPoolingTest")
      .Input("Input")
      .Output("Output")
      .Finalize(net.NewOperatorDef());

  // 147 values in total: 49 copies each of 1, 2 and 3.
  std::vector<float> input_data;
  input_data.reserve(147);
  for (int group = 0; group < 3; ++group) {
    input_data.insert(input_data.end(), 49, static_cast<float>(group + 1));
  }
  // The input tensor is registered under DeviceType::CPU even though the op
  // runs on NEON below.
  net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 3, 7, 7},
                                                input_data);

  // Execute on NEON.
  net.RunOp(DeviceType::NEON);

  // Verify against the per-group constants.
  auto expected_output = CreateTensor<float>({1, 3, 1, 1}, {1, 2, 3});
  ExpectTensorNear<float>(*expected_output, *net.GetOutput("Output"), 0.001);
}
#endif
mace/ops/matmul_benchmark.cc
浏览文件 @
780f5a60
...
...
@@ -61,10 +61,10 @@ static void MatMulBenchmark(
} \
BENCHMARK(BM_MATMUL_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE)
#define BM_MATMUL(N, H, C, W
, TYPE)
\
BM_MATMUL_MACRO(N, H, C, W,
TYPE
, OPENCL);
#define BM_MATMUL(N, H, C, W
)
\
BM_MATMUL_MACRO(N, H, C, W,
half
, OPENCL);
BM_MATMUL
(
16
,
32
,
128
,
49
,
half
);
BM_MATMUL
(
16
,
32
,
128
,
961
,
half
);
BM_MATMUL
(
16
,
32
,
128
,
3969
,
half
);
}
//
namespace mace
BM_MATMUL
(
16
,
32
,
128
,
49
);
BM_MATMUL
(
16
,
32
,
128
,
961
);
BM_MATMUL
(
16
,
32
,
128
,
3969
);
}
// namespace mace
mace/ops/pooling.cc
浏览文件 @
780f5a60
...
...
@@ -18,14 +18,6 @@ void Register_Pooling(OperatorRegistry *op_registry) {
.
Build
(),
PoolingOp
<
DeviceType
::
CPU
,
half
>
);
#if MACE_ENABLE_NEON
REGISTER_OPERATOR
(
op_registry
,
OpKeyBuilder
(
"Pooling"
)
.
Device
(
DeviceType
::
NEON
)
.
TypeConstraint
<
float
>
(
"T"
)
.
Build
(),
PoolingOp
<
DeviceType
::
NEON
,
float
>
);
#endif // MACE_ENABLE_NEON
REGISTER_OPERATOR
(
op_registry
,
OpKeyBuilder
(
"Pooling"
)
.
Device
(
DeviceType
::
OPENCL
)
.
TypeConstraint
<
float
>
(
"T"
)
...
...
mace/ops/resize_bilinear.cc
浏览文件 @
780f5a60
...
...
@@ -13,14 +13,6 @@ void Register_ResizeBilinear(OperatorRegistry *op_registry) {
.
Build
(),
ResizeBilinearOp
<
DeviceType
::
CPU
,
float
>
);
#if MACE_ENABLE_NEON
REGISTER_OPERATOR
(
op_registry
,
OpKeyBuilder
(
"ResizeBilinear"
)
.
Device
(
DeviceType
::
NEON
)
.
TypeConstraint
<
float
>
(
"T"
)
.
Build
(),
ResizeBilinearOp
<
DeviceType
::
NEON
,
float
>
);
#endif // MACE_ENABLE_NEON
REGISTER_OPERATOR
(
op_registry
,
OpKeyBuilder
(
"ResizeBilinear"
)
.
Device
(
DeviceType
::
OPENCL
)
.
TypeConstraint
<
float
>
(
"T"
)
...
...
mace/ops/resize_bilinear_benchmark.cc
浏览文件 @
780f5a60
...
...
@@ -69,18 +69,18 @@ static void ResizeBilinearBenchmark(int iters,
BENCHMARK( \
BM_RESIZE_BILINEAR_##N##_##C##_##H0##_##W0##_##H1##_##W1##_##TYPE##_##DEVICE)
#define BM_RESIZE_BILINEAR(N, C, H0, W0, H1, W1, TYPE) \
BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, TYPE, CPU); \
BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, TYPE, OPENCL);
#define BM_RESIZE_BILINEAR(N, C, H0, W0, H1, W1) \
BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, float, CPU); \
BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, float, OPENCL); \
BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, half, OPENCL);
// SNPE 835 GPU: 6870us
BM_RESIZE_BILINEAR
(
1
,
128
,
120
,
120
,
480
,
480
,
float
);
BM_RESIZE_BILINEAR
(
1
,
128
,
120
,
120
,
480
,
480
);
BM_RESIZE_BILINEAR
(
1
,
256
,
7
,
7
,
15
,
15
,
float
);
BM_RESIZE_BILINEAR
(
1
,
256
,
15
,
15
,
30
,
30
,
float
);
BM_RESIZE_BILINEAR
(
1
,
128
,
30
,
30
,
60
,
60
,
float
);
BM_RESIZE_BILINEAR
(
1
,
128
,
240
,
240
,
480
,
480
,
float
);
BM_RESIZE_BILINEAR
(
1
,
3
,
4032
,
3016
,
480
,
480
,
float
);
BM_RESIZE_BILINEAR
(
1
,
3
,
480
,
480
,
4032
,
3016
,
float
);
BM_RESIZE_BILINEAR
(
1
,
256
,
7
,
7
,
15
,
15
);
BM_RESIZE_BILINEAR
(
1
,
256
,
15
,
15
,
30
,
30
);
BM_RESIZE_BILINEAR
(
1
,
128
,
30
,
30
,
60
,
60
);
BM_RESIZE_BILINEAR
(
1
,
128
,
240
,
240
,
480
,
480
);
BM_RESIZE_BILINEAR
(
1
,
3
,
4032
,
3016
,
480
,
480
);
BM_RESIZE_BILINEAR
(
1
,
3
,
480
,
480
,
4032
,
3016
);
}
//
namespace mace
}
// namespace mace
mace/ops/softmax_benchmark.cc
浏览文件 @
780f5a60
...
...
@@ -55,13 +55,14 @@ static void SoftmaxBenchmark(
} \
BENCHMARK(BM_SOFTMAX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
#define BM_SOFTMAX(N, C, H, W, TYPE) \
BM_SOFTMAX_MACRO(N, C, H, W, TYPE, CPU); \
BM_SOFTMAX_MACRO(N, C, H, W, TYPE, OPENCL);
#define BM_SOFTMAX(N, C, H, W) \
BM_SOFTMAX_MACRO(N, C, H, W, float, CPU); \
BM_SOFTMAX_MACRO(N, C, H, W, float, OPENCL); \
BM_SOFTMAX_MACRO(N, C, H, W, half, OPENCL);
BM_SOFTMAX
(
1
,
1
,
512
,
512
,
float
);
BM_SOFTMAX
(
1
,
3
,
128
,
128
,
float
);
BM_SOFTMAX
(
1
,
3
,
512
,
512
,
float
);
BM_SOFTMAX
(
1
,
32
,
112
,
112
,
float
);
BM_SOFTMAX
(
1
,
64
,
256
,
256
,
float
);
BM_SOFTMAX
(
1
,
1
,
512
,
512
);
BM_SOFTMAX
(
1
,
3
,
128
,
128
);
BM_SOFTMAX
(
1
,
3
,
512
,
512
);
BM_SOFTMAX
(
1
,
32
,
112
,
112
);
BM_SOFTMAX
(
1
,
64
,
256
,
256
);
}
// namespace mace
mace/ops/space_to_batch_benchmark.cc
浏览文件 @
780f5a60
...
...
@@ -49,10 +49,10 @@ static void BMSpaceToBatch(
BENCHMARK( \
BM_SPACE_TO_BATCH_##N##_##H##_##W##_##C##_##SHAPE##_##TYPE##_##DEVICE)
#define BM_SPACE_TO_BATCH(N, H, W, C, SHAPE
, TYPE
) \
BM_SPACE_TO_BATCH_MACRO(N, H, W, C, SHAPE,
TYPE
, OPENCL);
#define BM_SPACE_TO_BATCH(N, H, W, C, SHAPE) \
BM_SPACE_TO_BATCH_MACRO(N, H, W, C, SHAPE,
float
, OPENCL);
BM_SPACE_TO_BATCH
(
128
,
16
,
16
,
128
,
2
,
float
);
BM_SPACE_TO_BATCH
(
1
,
256
,
256
,
32
,
2
,
float
);
BM_SPACE_TO_BATCH
(
1
,
256
,
256
,
32
,
4
,
float
);
}
// namespace mace
\ No newline at end of file
BM_SPACE_TO_BATCH
(
128
,
16
,
16
,
128
,
2
);
BM_SPACE_TO_BATCH
(
1
,
256
,
256
,
32
,
2
);
BM_SPACE_TO_BATCH
(
1
,
256
,
256
,
32
,
4
);
}
// namespace mace
mace/ops/winograd_convolution_test.cc
浏览文件 @
780f5a60
...
...
@@ -19,9 +19,9 @@ void TransposeFilter(const std::vector<float> &input,
const
float
*
input_ptr
=
input
.
data
();
for
(
index_t
h
=
0
;
h
<
input_shape
[
0
];
++
h
)
{
for
(
index_t
w
=
0
;
w
<
input_shape
[
1
];
++
w
)
{
for
(
index_t
ic
=
0
;
ic
<
input_shape
[
2
];
++
i
c
)
{
for
(
index_t
oc
=
0
;
oc
<
input_shape
[
3
];
++
o
c
)
{
int
offset
=
((
oc
*
input_shape
[
2
]
+
ic
)
*
input_shape
[
0
]
+
h
)
*
input_shape
[
1
]
+
w
;
for
(
index_t
oc
=
0
;
oc
<
input_shape
[
2
];
++
o
c
)
{
for
(
index_t
ic
=
0
;
ic
<
input_shape
[
3
];
++
i
c
)
{
int
offset
=
((
oc
*
input_shape
[
3
]
+
ic
)
*
input_shape
[
0
]
+
h
)
*
input_shape
[
1
]
+
w
;
output
[
offset
]
=
*
input_ptr
;
++
input_ptr
;
}
...
...
@@ -43,7 +43,7 @@ void WinogradConvolution(const index_t batch,
OpsTestNet
net
;
// Add input data
std
::
vector
<
float
>
filter_data
;
std
::
vector
<
index_t
>
filter_shape
=
{
3
,
3
,
in_channels
,
out
_channels
};
std
::
vector
<
index_t
>
filter_shape
=
{
3
,
3
,
out_channels
,
in
_channels
};
GenerateRandomRealTypeData
<
float
>
(
filter_shape
,
filter_data
);
net
.
AddRandomInput
<
D
,
float
>
(
"Input"
,
{
batch
,
height
,
width
,
in_channels
});
net
.
AddInputFromArray
<
D
,
float
>
(
"Filter"
,
filter_shape
,
filter_data
);
...
...
mace/ops/winograd_transform_benchmark.cc
浏览文件 @
780f5a60
...
...
@@ -48,12 +48,12 @@ static void BMWinogradTransform(
BENCHMARK( \
BM_WINOGRAD_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)
#define BM_WINOGRAD_TRANSFORM(N, H, W, C
, TYPE
) \
BM_WINOGRAD_TRANSFORM_MACRO(N, H, W, C,
TYPE
, OPENCL);
#define BM_WINOGRAD_TRANSFORM(N, H, W, C) \
BM_WINOGRAD_TRANSFORM_MACRO(N, H, W, C,
half
, OPENCL);
BM_WINOGRAD_TRANSFORM
(
1
,
16
,
16
,
128
,
half
);
BM_WINOGRAD_TRANSFORM
(
1
,
64
,
64
,
128
,
half
);
BM_WINOGRAD_TRANSFORM
(
1
,
128
,
128
,
128
,
half
);
BM_WINOGRAD_TRANSFORM
(
1
,
16
,
16
,
128
);
BM_WINOGRAD_TRANSFORM
(
1
,
64
,
64
,
128
);
BM_WINOGRAD_TRANSFORM
(
1
,
128
,
128
,
128
);
template
<
DeviceType
D
,
typename
T
>
static
void
BMWinogradInverseTransform
(
...
...
@@ -100,11 +100,11 @@ static void BMWinogradInverseTransform(
BENCHMARK( \
BM_WINOGRAD_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)
#define BM_WINOGRAD_INVERSE_TRANSFORM(N, H, W, C
, TYPE
) \
BM_WINOGRAD_INVERSE_TRANSFORM_MACRO(N, H, W, C,
TYPE
, OPENCL);
#define BM_WINOGRAD_INVERSE_TRANSFORM(N, H, W, C) \
BM_WINOGRAD_INVERSE_TRANSFORM_MACRO(N, H, W, C,
half
, OPENCL);
BM_WINOGRAD_INVERSE_TRANSFORM
(
1
,
14
,
14
,
32
,
half
);
BM_WINOGRAD_INVERSE_TRANSFORM
(
1
,
62
,
62
,
32
,
half
);
BM_WINOGRAD_INVERSE_TRANSFORM
(
1
,
126
,
126
,
32
,
half
);
BM_WINOGRAD_INVERSE_TRANSFORM
(
1
,
14
,
14
,
32
);
BM_WINOGRAD_INVERSE_TRANSFORM
(
1
,
62
,
62
,
32
);
BM_WINOGRAD_INVERSE_TRANSFORM
(
1
,
126
,
126
,
32
);
}
// namespace mace
\ No newline at end of file
}
// namespace mace
mace/utils/utils.h
浏览文件 @
780f5a60
...
...
@@ -30,6 +30,11 @@ Integer RoundUpDiv8(Integer i) {
return
(
i
+
7
)
>>
3
;
}
// Divides `i` by `factor`, rounding the quotient up.
//
// Computed as (i + factor - 1) / factor using the integer division of the
// operand type, so the result is the ceiling of i / factor when `i` is
// non-negative and `factor` is positive.
// NOTE(review): no guard against factor == 0 or against overflow of
// i + factor - 1; callers are presumably passing small sizes -- confirm.
template <typename Integer>
Integer RoundUpDiv(Integer i, Integer factor) {
  // Biasing the numerator by (factor - 1) makes the truncating division
  // round any inexact quotient up by one.
  const auto biased_numerator = i + factor - 1;
  return biased_numerator / factor;
}
template
<
typename
Integer
>
Integer
CeilQuotient
(
Integer
a
,
Integer
b
)
{
return
(
a
+
b
-
1
)
/
b
;
...
...
tools/bazel-adb-run.sh
浏览文件 @
780f5a60
...
...
@@ -18,8 +18,8 @@ BAZEL_BIN_PATH=${BAZEL_BIN_PATH#//}
BAZEL_BIN_PATH
=
bazel-bin/
$BAZEL_BIN_PATH
BIN_NAME
=
`
echo
$BAZEL_TARGET
|
cut
-d
:
-f2
`
ANDROID_ABI
=
arm64-v8a
ANDROID_ABI
=
armeabi-v7a
ANDROID_ABI
=
arm64-v8a
STRIP
=
"--strip always"
VLOG_LEVEL
=
0
PROFILING
=
"1"
...
...
@@ -43,7 +43,8 @@ bazel build -c opt $STRIP --verbose_failures $BAZEL_TARGET \
--copt
=
"-D_GLIBCXX_USE_C99_MATH_TR1"
\
--copt
=
"-DMACE_DISABLE_NO_TUNING_WARNING"
\
--copt
=
"-Werror=return-type"
\
--define
neon
=
false
\
--copt
=
"-O3"
\
--define
neon
=
true
\
--define
openmp
=
true
if
[
$?
-ne
0
]
;
then
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录