Commit aba88a23 (Xiaomi / Mace)
Authored Feb 09, 2018 by Liangliang He
Parent: d394298e

Improve CPU implementations
Showing 15 changed files with 247 additions and 233 deletions.
Changed files:

  mace/core/common.h                         +0    -2
  mace/core/tensor.h                         +6    -0
  mace/kernels/BUILD                         +0    -1
  mace/kernels/addn.h                        +54   -11
  mace/kernels/batch_norm.h                  +5    -7
  mace/kernels/depthwise_conv2d.h            +166  -41
  mace/kernels/neon/addn_neon.cc             +0    -57
  mace/kernels/resize_bilinear.h             +3    -4
  mace/ops/addn.cc                           +0    -8
  mace/ops/addn_benchmark.cc                 +0    -1
  mace/ops/addn_test.cc                      +0    -4
  mace/ops/batch_norm_benchmark.cc           +1    -1
  mace/ops/batch_norm_test.cc                +0    -84
  mace/ops/depthwise_conv2d_test.cc          +1    -1
  mace/ops/depthwise_conv_2d_benchmark.cc    +11   -11
mace/core/common.h

```diff
@@ -32,6 +32,4 @@ typedef int64_t index_t;
 #define MACE_NOT_IMPLEMENTED MACE_CHECK(false, "not implemented")
 
-#define kCostPerGroup 10240
-
 #endif  // MACE_CORE_COMMON_H_
```
mace/core/tensor.h

```diff
@@ -325,6 +325,12 @@ class Tensor {
       }
     }
   }
 
+  MappingGuard(MappingGuard &&other) {
+    tensor_ = other.tensor_;
+    other.tensor_ = nullptr;
+  }
+  MappingGuard(const MappingGuard &other) = delete;
+  MappingGuard &operator=(const MappingGuard &other) = delete;
 
   ~MappingGuard() {
     if (tensor_ != nullptr) tensor_->Unmap();
   }
```
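The added move constructor plus the deleted copy operations make MappingGuard a move-only RAII type, which is what lets the new addn.h below keep a `std::vector<Tensor::MappingGuard>` alive across its parallel loop. A minimal sketch of the same pattern, with hypothetical `Buffer`/`Guard` types standing in for `Tensor`/`MappingGuard`:

```cpp
#include <cstdio>
#include <vector>

struct Buffer {
  void Unmap() { std::printf("unmapped\n"); }
};

class Guard {
 public:
  explicit Guard(Buffer *buf) : buf_(buf) {}
  // Move transfers ownership and disarms the source...
  Guard(Guard &&other) : buf_(other.buf_) { other.buf_ = nullptr; }
  // ...while copying stays forbidden, so a buffer is never unmapped twice.
  Guard(const Guard &) = delete;
  Guard &operator=(const Guard &) = delete;
  ~Guard() {
    if (buf_ != nullptr) buf_->Unmap();  // only the surviving owner unmaps
  }

 private:
  Buffer *buf_;
};

int main() {
  Buffer b;
  std::vector<Guard> guards;
  guards.emplace_back(&b);  // vector growth needs the move constructor
  return 0;                 // prints "unmapped" exactly once
}
```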
mace/kernels/BUILD

```diff
@@ -15,7 +15,6 @@ cc_library(
         "*.cc",
         "opencl/*.cc",
     ]) + if_neon_enabled(glob([
-        "neon/addn_neon.cc",
         "neon/batch_norm_neon.cc",
     ])),
     hdrs = glob([
```
mace/kernels/addn.h

```diff
@@ -8,29 +8,71 @@
+#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
+#include <arm_neon.h>
+#endif
+#include <vector>
+
 #include "mace/core/future.h"
-#include "mace/core/tensor.h"
+#include "mace/core/runtime/opencl/cl2_header.h"
+#include "mace/core/tensor.h"
 
 namespace mace {
 namespace kernels {
 
+namespace {
+constexpr int kCostPerGroup = 1024;
+}  // namespace
+
 template <DeviceType D, typename T>
 struct AddNFunctor {
   void operator()(const std::vector<const Tensor *> &input_tensors,
                   Tensor *output_tensor, StatsFuture *future) {
     output_tensor->ResizeLike(input_tensors[0]);
-    index_t size = output_tensor->size();
-    T *output_ptr = output_tensor->mutable_data<T>();
-    memset(output_ptr, 0, size * sizeof(T));
+    Tensor::MappingGuard output_map(output_tensor);
+    index_t size = input_tensors[0]->size();
+    float *output_data = output_tensor->mutable_data<float>();
+    memset(output_data, 0, size * sizeof(float));
     int n = input_tensors.size();
-    for (int i = 0; i < n; ++i) {
-      Tensor::MappingGuard input_map(input_tensors[i]);
-      const T *input_ptr = input_tensors[i]->data<T>();
-      for (index_t j = 0; j < size; ++j) {
-        output_ptr[j] += input_ptr[j];
-      }
-    }
+    int64_t cost = size * n;
+    int64_t groups = 1;
+    if (cost > kCostPerGroup) {
+      groups = cost / kCostPerGroup;
+    }
+    int64_t element_per_group = size / groups;
+    std::vector<Tensor::MappingGuard> mappers;
+    for (int64_t i = 0; i < n; ++i) {
+      mappers.emplace_back(Tensor::MappingGuard(input_tensors[i]));
+    }
+#pragma omp parallel for
+    for (int64_t i = 0; i < size; i += element_per_group) {
+      int64_t count = std::min(element_per_group, size - i);
+      int nn = count >> 2;
+      int remain = count - (nn << 2);
+      for (int64_t j = 0; j < n; ++j) {
+        const float *input_data = input_tensors[j]->data<float>();
+        const float *input_ptr = input_data + i;
+        float *output_ptr = output_data + i;
+        for (int k = 0; k < nn; ++k) {
+#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
+          float32x4_t in = vld1q_f32(input_ptr);
+          float32x4_t out = vld1q_f32(output_ptr);
+          out = vaddq_f32(out, in);
+          vst1q_f32(output_ptr, out);
+#else
+          for (int m = 0; m < 4; ++m) {
+            output_ptr[m] += input_ptr[m];
+          }
+#endif
+          input_ptr += 4;
+          output_ptr += 4;
+        }
+        for (int k = 0; k < remain; ++k) {
+          *output_ptr += *input_ptr;
+          ++input_ptr;
+          ++output_ptr;
+        }
+      }
+    }
   }
```
```diff
@@ -45,7 +87,8 @@ void AddNFunctor<DeviceType::NEON, float>::operator()(
 template <typename T>
 struct AddNFunctor<DeviceType::OPENCL, T> {
   void operator()(const std::vector<const Tensor *> &input_tensors,
                   Tensor *output_tensor, StatsFuture *future);
+  cl::Kernel kernel_;
 };
```
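The rewritten AddN folds the old NEON-only implementation (the file deleted below) into the generic functor: the flat tensor is split into groups sized by a cost heuristic, each OpenMP iteration takes one group, and the inner 4-wide step uses NEON intrinsics when available. A standalone sketch of just the grouping arithmetic, with illustrative sizes; the `std::max` clamp is my addition for safety, the commit divides without it:

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
  constexpr int64_t kCostPerGroup = 1024;  // threshold used by the new addn.h
  const int64_t size = 10000;              // elements per tensor (illustrative)
  const int64_t n = 4;                     // number of input tensors

  const int64_t cost = size * n;
  const int64_t groups = cost > kCostPerGroup ? cost / kCostPerGroup : 1;
  // Clamp added for this sketch only; the commit uses size / groups directly.
  const int64_t element_per_group = std::max<int64_t>(1, size / groups);

  int64_t num_groups = 0;
  for (int64_t i = 0; i < size; i += element_per_group) {
    const int64_t count = std::min(element_per_group, size - i);
    (void)count;  // one parallel iteration would process `count` elements
    ++num_groups;
  }
  std::printf("%lld groups of ~%lld elements\n",
              static_cast<long long>(num_groups),
              static_cast<long long>(element_per_group));
  return 0;
}
```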
mace/kernels/batch_norm.h

```diff
@@ -70,8 +70,8 @@ struct BatchNormFunctor : BatchNormFunctorBase {
     const T *offset_ptr = offset->data<T>();
     T *output_ptr = output->mutable_data<T>();
 
-    vector<T> new_scale;
-    vector<T> new_offset;
+    std::vector<T> new_scale;
+    std::vector<T> new_offset;
     if (!folded_constant_) {
       new_scale.resize(channels);
       new_offset.resize(channels);
@@ -86,6 +86,8 @@ struct BatchNormFunctor : BatchNormFunctorBase {
       }
     }
 
+    const T *scale_data = folded_constant_ ? scale_ptr : new_scale.data();
+    const T *offset_data = folded_constant_ ? offset_ptr : new_offset.data();
 #pragma omp parallel for collapse(4)
     for (index_t n = 0; n < batch; ++n) {
@@ -93,11 +95,7 @@ struct BatchNormFunctor : BatchNormFunctorBase {
       for (index_t w = 0; w < width; ++w) {
         for (index_t c = 0; c < channels; ++c) {
           index_t pos = (((n * height) + h) * width + w) * channels + c;
-          if (folded_constant_) {
-            output_ptr[pos] = scale_ptr[c] * input_ptr[pos] + offset_ptr[c];
-          } else {
-            output_ptr[pos] = new_scale[c] * input_ptr[pos] + new_offset[c];
-          }
+          output_ptr[pos] = scale_data[c] * input_ptr[pos] + offset_data[c];
         }
       }
     }
```
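The batch_norm.h change hoists the `folded_constant_` branch out of the innermost loop by selecting `scale_data`/`offset_data` once up front. A minimal sketch of that hoisting pattern, simplified to scalar coefficients with illustrative names:

```cpp
#include <cstddef>

// Before: the predicate is re-evaluated for every element.
void ScaleBranchy(const float *in, float *out, std::size_t count, bool folded,
                  float s_folded, float o_folded, float s_new, float o_new) {
  for (std::size_t i = 0; i < count; ++i) {
    if (folded) {
      out[i] = s_folded * in[i] + o_folded;
    } else {
      out[i] = s_new * in[i] + o_new;
    }
  }
}

// After: select once, keep the hot loop branch-free and easier for the
// compiler to vectorize; this mirrors the scale_data/offset_data selection.
void ScaleHoisted(const float *in, float *out, std::size_t count, bool folded,
                  float s_folded, float o_folded, float s_new, float o_new) {
  const float scale = folded ? s_folded : s_new;
  const float offset = folded ? o_folded : o_new;
  for (std::size_t i = 0; i < count; ++i) {
    out[i] = scale * in[i] + offset;
  }
}
```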
mace/kernels/depthwise_conv2d.h

```diff
@@ -8,12 +8,141 @@
 #include "mace/core/common.h"
 #include "mace/core/future.h"
 #include "mace/core/public/mace.h"
-#include "mace/kernels/conv_pool_2d_util.h"
+#include "mace/core/runtime/opencl/cl2_header.h"
+#include "mace/kernels/conv_pool_2d_util.h"
 
 namespace mace {
 namespace kernels {
 
+namespace {
+
+template <typename T>
+void DepthwiseConv2dKernel(const T *input_ptr, const T *filter_ptr,
+                           const T *bias_ptr, T *output_ptr, int batch,
+                           int height, int width, int channels,
+                           int input_height, int input_width,
+                           int input_channels, int multiplier,
+                           int padded_h_start, int padded_h_stop,
+                           int padded_w_start, int padded_w_stop,
+                           int kernel_h, int kernel_w, int stride_h,
+                           int stride_w, int dilation_h, int dilation_w,
+                           int h_start, int h_stop, int w_start, int w_stop) {
+#pragma omp parallel for collapse(4)
+  for (int n = 0; n < batch; ++n) {
+    for (int h = h_start; h < h_stop; ++h) {
+      for (int w = w_start; w < w_stop; ++w) {
+        for (int c = 0; c < channels; ++c) {
+          const index_t inc = c / multiplier;
+          const index_t m = c % multiplier;
+          T bias_channel = bias_ptr ? bias_ptr[c] : 0;
+          index_t offset = n * height * width * channels +
+                           h * width * channels + w * channels + c;
+          output_ptr[offset] = bias_channel;
+          T sum = 0;
+          const T *filter_base = filter_ptr + inc * multiplier + m;
+          for (int kh = 0; kh < kernel_h; ++kh) {
+            for (int kw = 0; kw < kernel_w; ++kw) {
+              int inh = padded_h_start + h * stride_h + dilation_h * kh;
+              int inw = padded_w_start + w * stride_w + dilation_w * kw;
+              if (inh < 0 || inh >= input_height || inw < 0 ||
+                  inw >= input_width) {
+                MACE_CHECK(inh >= padded_h_start && inh < padded_h_stop &&
+                               inw >= padded_w_start && inw < padded_w_stop,
+                           "Out of range read from input: ", inh, ", ", inw);
+              } else {
+                index_t input_offset =
+                    n * input_height * input_width * input_channels +
+                    inh * input_width * input_channels +
+                    inw * input_channels + inc;
+                sum += input_ptr[input_offset] * filter_base[0];  // HWIM
+              }
+              filter_base += input_channels * multiplier;
+            }
+          }
+          output_ptr[offset] += sum;
+        }
+      }
+    }
+  }
+}
+
+template <typename T>
+void DepthwiseConv2dNoOOBCheckKernel(
+    const T *input_ptr, const T *filter_ptr, const T *bias_ptr, T *output_ptr,
+    int batch, int height, int width, int channels, int input_height,
+    int input_width, int input_channels, int multiplier, int padded_h_start,
+    int padded_h_stop, int padded_w_start, int padded_w_stop, int kernel_h,
+    int kernel_w, int stride_h, int stride_w, int dilation_h, int dilation_w,
+    int h_start, int h_stop, int w_start, int w_stop) {
+#pragma omp parallel for collapse(4)
+  for (int n = 0; n < batch; ++n) {
+    for (int h = h_start; h < h_stop; ++h) {
+      for (int w = w_start; w < w_stop; ++w) {
+        for (int c = 0; c < channels; ++c) {
+          const index_t inc = c / multiplier;
+          const index_t m = c % multiplier;
+          T bias_channel = bias_ptr ? bias_ptr[c] : 0;
+          index_t offset = n * height * width * channels +
+                           h * width * channels + w * channels + c;
+          output_ptr[offset] = bias_channel;
+          T sum = 0;
+          const T *filter_base = filter_ptr + inc * multiplier + m;
+          for (int kh = 0; kh < kernel_h; ++kh) {
+            for (int kw = 0; kw < kernel_w; ++kw) {
+              int inh = padded_h_start + h * stride_h + dilation_h * kh;
+              int inw = padded_w_start + w * stride_w + dilation_w * kw;
+              index_t input_offset =
+                  n * input_height * input_width * input_channels +
+                  inh * input_width * input_channels +
+                  inw * input_channels + inc;
+              // TODO vectorize this
+              sum += input_ptr[input_offset] * filter_base[0];  // HWIM
+              filter_base += input_channels * multiplier;
+            }
+          }
+          output_ptr[offset] += sum;
+        }
+      }
+    }
+  }
+}
+
+}  // namespace
+
 struct DepthwiseConv2dFunctorBase {
   DepthwiseConv2dFunctorBase(const int *strides, const Padding padding,
```
```diff
@@ -28,7 +157,7 @@ struct DepthwiseConv2dFunctorBase {
         relux_max_limit_(relux_max_limit),
         prelu_alpha_(prelu_alpha) {}
 
-  const int *strides_;         // [stride_h, stride_w]
+  const int *strides_;  // [stride_h, stride_w]
   const Padding padding_;
   const int *dilations_;  // [dilation_h, dilation_w]
   const ActivationType activation_;
```
```diff
@@ -88,7 +217,8 @@ struct DepthwiseConv2dFunctor : public DepthwiseConv2dFunctorBase {
     index_t kernel_h = filter->dim(0);
     index_t kernel_w = filter->dim(1);
     index_t multiplier = filter->dim(3);
     MACE_CHECK(filter->dim(2) == input_channels, filter->dim(2), "!=",
                input_channels);
+    MACE_CHECK(channels == input_channels * multiplier);
 
     int stride_h = strides_[0];
```
```diff
@@ -114,43 +244,38 @@ struct DepthwiseConv2dFunctor : public DepthwiseConv2dFunctorBase {
     const T *bias_ptr = bias == nullptr ? nullptr : bias->data<T>();
     T *output_ptr = output->mutable_data<T>();
 
-#pragma omp parallel for collapse(4)
-    for (int n = 0; n < batch; ++n) {
-      for (int h = 0; h < height; ++h) {
-        for (int w = 0; w < width; ++w) {
-          for (int c = 0; c < channels; ++c) {
-            const index_t inc = c / multiplier;
-            const index_t m = c % multiplier;
-            T bias_channel = bias_ptr ? bias_ptr[c] : 0;
-            index_t offset = n * height * width * channels +
-                             h * width * channels + w * channels + c;
-            output_ptr[offset] = bias_channel;
-            T sum = 0;
-            const T *filter_base = filter_ptr + inc * multiplier + m;
-            for (int kh = 0; kh < kernel_h; ++kh) {
-              for (int kw = 0; kw < kernel_w; ++kw) {
-                int inh = padded_h_start + h * stride_h + dilation_h * kh;
-                int inw = padded_w_start + w * stride_w + dilation_w * kw;
-                if (inh < 0 || inh >= input_height || inw < 0 ||
-                    inw >= input_width) {
-                  MACE_CHECK(inh >= padded_h_start && inh < padded_h_stop &&
-                                 inw >= padded_w_start && inw < padded_w_stop,
-                             "Out of range read from input: ", inh, ", ", inw);
-                } else {
-                  index_t input_offset =
-                      n * input_height * input_width * input_channels +
-                      inh * input_width * input_channels +
-                      inw * input_channels + inc;
-                  sum += input_ptr[input_offset] * filter_base[0];  // HWIM
-                }
-                filter_base += input_channels * multiplier;
-              }
-            }
-            output_ptr[offset] += sum;
-          }
-        }
-      }
-    }
+    // Calculate border elements with out-of-boundary checking
+    DepthwiseConv2dKernel<T>(
+        input_ptr, filter_ptr, bias_ptr, output_ptr, batch, height, width,
+        channels, input_height, input_width, input_channels, multiplier,
+        padded_h_start, padded_h_stop, padded_w_start, padded_w_stop,
+        kernel_h, kernel_w, stride_h, stride_w, dilation_h, dilation_w,
+        0, 1, 0, width);
+    DepthwiseConv2dKernel<T>(
+        input_ptr, filter_ptr, bias_ptr, output_ptr, batch, height, width,
+        channels, input_height, input_width, input_channels, multiplier,
+        padded_h_start, padded_h_stop, padded_w_start, padded_w_stop,
+        kernel_h, kernel_w, stride_h, stride_w, dilation_h, dilation_w,
+        height - 1, height, 0, width);
+    DepthwiseConv2dKernel<T>(
+        input_ptr, filter_ptr, bias_ptr, output_ptr, batch, height, width,
+        channels, input_height, input_width, input_channels, multiplier,
+        padded_h_start, padded_h_stop, padded_w_start, padded_w_stop,
+        kernel_h, kernel_w, stride_h, stride_w, dilation_h, dilation_w,
+        1, height - 1, 0, 1);
+    DepthwiseConv2dKernel<T>(
+        input_ptr, filter_ptr, bias_ptr, output_ptr, batch, height, width,
+        channels, input_height, input_width, input_channels, multiplier,
+        padded_h_start, padded_h_stop, padded_w_start, padded_w_stop,
+        kernel_h, kernel_w, stride_h, stride_w, dilation_h, dilation_w,
+        1, height - 1, width - 1, width);
+    // Calculate border elements without out-of-boundary checking
+    DepthwiseConv2dNoOOBCheckKernel<T>(
+        input_ptr, filter_ptr, bias_ptr, output_ptr, batch, height, width,
+        channels, input_height, input_width, input_channels, multiplier,
+        padded_h_start, padded_h_stop, padded_w_start, padded_w_stop,
+        kernel_h, kernel_w, stride_h, stride_w, dilation_h, dilation_w,
+        1, height - 1, 1, width - 1);
 
     output_ptr = output->mutable_data<T>();
     DoActivation(output_ptr, output_ptr, output->NumElements(), activation_,
```
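The five calls above partition the output plane so that only border pixels pay for bounds checks: the top row, bottom row, and left and right columns go through the checked kernel, and the interior goes through the unchecked one. A toy sketch of the same decomposition, with a hypothetical `RegionFn` callback in place of the convolution kernels:

```cpp
#include <cstdio>

typedef void (*RegionFn)(int h_start, int h_stop, int w_start, int w_stop);

// Region bounds mirror the five DepthwiseConv2d*Kernel calls above.
void Decompose(int height, int width, RegionFn checked, RegionFn unchecked) {
  checked(0, 1, 0, width);                   // top row
  checked(height - 1, height, 0, width);     // bottom row
  checked(1, height - 1, 0, 1);              // left column, corners excluded
  checked(1, height - 1, width - 1, width);  // right column, corners excluded
  unchecked(1, height - 1, 1, width - 1);    // interior: no bounds checks
}

int main() {
  RegionFn print = [](int h0, int h1, int w0, int w1) {
    std::printf("region h=[%d,%d) w=[%d,%d)\n", h0, h1, w0, w1);
  };
  Decompose(8, 8, print, print);
  return 0;
}
```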
```diff
@@ -180,7 +305,7 @@ struct DepthwiseConv2dFunctor<DeviceType::OPENCL, T>
           dilations,
           activation,
           relux_max_limit,
-          prelu_alpha){}
+          prelu_alpha) {}
 
   void operator()(const Tensor *input,
                   const Tensor *filter,
```
mace/kernels/neon/addn_neon.cc (deleted, 100644 → 0)

```cpp
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//

#include "mace/kernels/addn.h"
#include <arm_neon.h>

namespace mace {
namespace kernels {

template <>
void AddNFunctor<DeviceType::NEON, float>::operator()(
    const std::vector<const Tensor *> &input_tensors, Tensor *output_tensor,
    StatsFuture *future) {
  // TODO: neon mem copy
  output_tensor->ResizeLike(input_tensors[0]);
  index_t size = output_tensor->size();
  float *output_ptr = output_tensor->mutable_data<float>();
  memset(output_ptr, 0, size * sizeof(float));
  int n = input_tensors.size();
  int64_t cost = size * n;
  int64_t groups = 1;
  if (cost > kCostPerGroup) {
    groups = cost / kCostPerGroup;
  }
  int64_t element_per_group = size / groups;
#pragma omp parallel for
  for (int64_t i = 0; i < size; i += element_per_group) {
    int64_t count = std::min(element_per_group, size - i);
    int nn = count >> 2;
    int remain = count - (nn << 2);
    for (int64_t j = 0; j < n; ++j) {
      const float *input_base = input_tensors[j]->data<float>();
      const float *inptr = input_base + i;
      float *outptr = output_ptr + i;
      for (int k = 0; k < nn; ++k) {
        float32x4_t _inptr = vld1q_f32(inptr);
        float32x4_t _outptr = vld1q_f32(outptr);
        _outptr = vaddq_f32(_outptr, _inptr);
        vst1q_f32(outptr, _outptr);
        inptr += 4;
        outptr += 4;
      }
      for (int k = 0; k < remain; ++k) {
        *outptr += *inptr;
        ++inptr;
        ++outptr;
      }
    }
  }
};

}  // namespace kernels
}  // namespace mace
```
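The deleted file's inner-loop shape (four floats per NEON iteration, then a scalar tail, via `nn = count >> 2` and `remain = count - (nn << 2)`) survives unchanged in the new addn.h. A self-contained sketch of that split using the portable fallback, so it runs without NEON; the vector path would replace the 4-element loop with `vld1q_f32`/`vaddq_f32`/`vst1q_f32` as above:

```cpp
#include <cstdio>

// Accumulate src into dst: a 4-wide main loop plus a scalar remainder.
void AccumulateBlocks(const float *src, float *dst, int count) {
  const int nn = count >> 2;             // number of full 4-element blocks
  const int remain = count - (nn << 2);  // leftover elements
  for (int k = 0; k < nn; ++k) {
    for (int m = 0; m < 4; ++m) {  // NEON would do this in one vaddq_f32
      dst[m] += src[m];
    }
    src += 4;
    dst += 4;
  }
  for (int k = 0; k < remain; ++k) {  // scalar tail
    *dst++ += *src++;
  }
}

int main() {
  float src[7] = {1, 1, 1, 1, 1, 1, 1};
  float dst[7] = {0};
  AccumulateBlocks(src, dst, 7);
  std::printf("%g %g\n", dst[0], dst[6]);  // prints: 1 1
  return 0;
}
```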
mace/kernels/resize_bilinear.h

```diff
@@ -68,12 +68,11 @@ void ResizeImage(const T *images,
   const index_t out_batch_num_values = channels * out_height * out_width;
   const CachedInterpolation *xs = xs_vec.data();
 
-#pragma omp parallel for
+#pragma omp parallel for collapse(2)
   for (index_t b = 0; b < batch_size; ++b) {
-    const T *batch_input_ptr = images + in_batch_num_values * b;;
-    T *batch_output_ptr = output + out_batch_num_values * b;
     for (index_t y = 0; y < out_height; ++y) {
+      const T *batch_input_ptr = images + in_batch_num_values * b;
+      T *batch_output_ptr = output + out_batch_num_values * b;
       const T *y_lower_input_ptr =
           batch_input_ptr + ys[y].lower * in_width * channels;
       const T *y_upper_input_ptr =
```
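The resize_bilinear.h change can only add `collapse(2)` because the two loops become perfectly nested: the per-batch pointer setup moves inside the `y` loop (also dropping a stray double semicolon), leaving no statements between the two `for` headers. A minimal sketch of the fused form, with illustrative sizes; compile with `-fopenmp` to actually parallelize:

```cpp
#include <cstdio>

int main() {
  const int batch_size = 2, out_height = 3;
  // collapse(2) fuses b and y into one 6-iteration space, so the body must
  // compute all per-b state itself rather than rely on code between headers.
#pragma omp parallel for collapse(2)
  for (int b = 0; b < batch_size; ++b) {
    for (int y = 0; y < out_height; ++y) {
      std::printf("b=%d y=%d\n", b, y);  // per-(b, y) work goes here
    }
  }
  return 0;
}
```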
mace/ops/addn.cc

```diff
@@ -13,14 +13,6 @@ void Register_AddN(OperatorRegistry *op_registry) {
                         .Build(),
                     AddNOp<DeviceType::CPU, float>);
 
-#if MACE_ENABLE_NEON
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("AddN")
-                        .Device(DeviceType::NEON)
-                        .TypeConstraint<float>("T")
-                        .Build(),
-                    AddNOp<DeviceType::NEON, float>);
-#endif  // MACE_ENABLE_NEON
-
   REGISTER_OPERATOR(op_registry, OpKeyBuilder("AddN")
                         .Device(DeviceType::OPENCL)
                         .TypeConstraint<float>("T")
```
mace/ops/addn_benchmark.cc

```diff
@@ -67,7 +67,6 @@ static void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) {
 #define BM_ADDN(INPUTS, N, H, W, C)                 \
   BM_ADDN_MACRO(INPUTS, N, H, W, C, float, CPU);    \
-  BM_ADDN_MACRO(INPUTS, N, H, W, C, float, NEON);   \
   BM_ADDN_MACRO(INPUTS, N, H, W, C, float, OPENCL); \
   BM_ADDN_MACRO(INPUTS, N, H, W, C, half, OPENCL);
```
mace/ops/addn_test.cc

```diff
@@ -33,8 +33,6 @@ void SimpleAdd2() {
 TEST_F(AddnOpTest, CPUSimpleAdd2) { SimpleAdd2<DeviceType::CPU>(); }
 
-TEST_F(AddnOpTest, NEONSimpleAdd2) { SimpleAdd2<DeviceType::NEON>(); }
-
 template <DeviceType D>
 void SimpleAdd3() {
   // Construct graph
@@ -61,8 +59,6 @@ void SimpleAdd3() {
 TEST_F(AddnOpTest, CPUSimpleAdd3) { SimpleAdd3<DeviceType::CPU>(); }
 
-TEST_F(AddnOpTest, NEONSimpleAdd3) { SimpleAdd3<DeviceType::NEON>(); }
-
 template <DeviceType D>
 void RandomTest() {
   testing::internal::LogToStderr();
```
mace/ops/batch_norm_benchmark.cc

```diff
@@ -84,7 +84,7 @@ static void BatchNorm(
 #define BM_BATCH_NORM(N, C, H, W)                 \
   BM_BATCH_NORM_MACRO(N, C, H, W, float, CPU);    \
-  BM_BATCH_NORM_MACRO(N, C, H, W, float, NEON);  \
+  BM_BATCH_NORM_MACRO(N, C, H, W, float, NEON);   \
   BM_BATCH_NORM_MACRO(N, C, H, W, float, OPENCL); \
   BM_BATCH_NORM_MACRO(N, C, H, W, half, OPENCL);
```
mace/ops/batch_norm_test.cc

```diff
@@ -72,92 +72,8 @@ void Simple() {
 TEST_F(BatchNormOpTest, SimpleCPU) { Simple<DeviceType::CPU>(); }
 
-TEST_F(BatchNormOpTest, SimpleNEON) { Simple<DeviceType::NEON>(); }
-
 TEST_F(BatchNormOpTest, SimpleOPENCL) { Simple<DeviceType::OPENCL>(); }
 
-TEST_F(BatchNormOpTest, SimpleRandomNeon) {
-  srand(time(NULL));
-
-  // generate random input
-  index_t batch = 1 + rand() % 10;
-  index_t height = 64;
-  index_t width = 64;
-  index_t channels = 3 + rand() % 50;
-
-  // Construct graph
-  OpsTestNet net;
-  OpDefBuilder("BatchNorm", "BatchNormTest")
-      .Input("Input")
-      .Input("Scale")
-      .Input("Offset")
-      .Input("Mean")
-      .Input("Var")
-      .AddFloatArg("epsilon", 1e-3)
-      .Output("Output")
-      .Finalize(net.NewOperatorDef());
-
-  // Add input data
-  net.AddRandomInput<DeviceType::CPU, float>("Input",
-                                             {batch, height, width, channels});
-  net.AddRandomInput<DeviceType::CPU, float>("Scale", {channels});
-  net.AddRandomInput<DeviceType::CPU, float>("Offset", {channels});
-  net.AddRandomInput<DeviceType::CPU, float>("Mean", {channels});
-  net.AddRandomInput<DeviceType::CPU, float>("Var", {channels}, true);
-
-  // run cpu
-  net.RunOp();
-
-  // Check
-  Tensor expected;
-  expected.Copy(*net.GetOutput("Output"));
-
-  // Run NEON
-  net.RunOp(DeviceType::NEON);
-  ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 1e-2);
-}
-
-TEST_F(BatchNormOpTest, ComplexRandomNeon) {
-  srand(time(NULL));
-
-  // generate random input
-  index_t batch = 1 + rand() % 10;
-  index_t channels = 3 + rand() % 50;
-  index_t height = 103;
-  index_t width = 113;
-
-  // Construct graph
-  OpsTestNet net;
-  OpDefBuilder("BatchNorm", "BatchNormTest")
-      .Input("Input")
-      .Input("Scale")
-      .Input("Offset")
-      .Input("Mean")
-      .Input("Var")
-      .AddFloatArg("epsilon", 1e-3)
-      .Output("Output")
-      .Finalize(net.NewOperatorDef());
-
-  // Add input data
-  net.AddRandomInput<DeviceType::CPU, float>("Input",
-                                             {batch, height, width, channels});
-  net.AddRandomInput<DeviceType::CPU, float>("Scale", {channels});
-  net.AddRandomInput<DeviceType::CPU, float>("Offset", {channels});
-  net.AddRandomInput<DeviceType::CPU, float>("Mean", {channels});
-  net.AddRandomInput<DeviceType::CPU, float>("Var", {channels}, true);
-
-  // run cpu
-  net.RunOp();
-
-  // Check
-  Tensor expected;
-  expected.Copy(*net.GetOutput("Output"));
-
-  // Run NEON
-  net.RunOp(DeviceType::NEON);
-  ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 1e-2);
-}
-
 TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
   srand(time(NULL));
```
mace/ops/depthwise_conv2d_test.cc

```diff
@@ -283,7 +283,7 @@ void TestNxNS12(const index_t height, const index_t width) {
   for (int kernel_size : {3}) {
     for (int stride : {1, 2}) {
       func(kernel_size, kernel_size, stride, stride, VALID);
-      //      func(kernel_size, kernel_size, stride, stride, SAME);
+      func(kernel_size, kernel_size, stride, stride, SAME);
     }
   }
 }
```
mace/ops/depthwise_conv_2d_benchmark.cc

```diff
@@ -95,16 +95,16 @@ static void DepthwiseConv2d(int iters,
   BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, half, OPENCL);
 
 BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 1, VALID, 1);
-// BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 1);
+BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 1);
 BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 1, SAME, 1);
-// BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 1, SAME, 1);
-// BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 1, VALID, 1);
-// BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 1, SAME, 1);
-// BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 2, VALID, 1);
-// BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 2, VALID, 1);
-// BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 2, SAME, 1);
-// BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 2, SAME, 1);
-// BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 2, VALID, 1);
-// BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 2, SAME, 1);
+BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 1, SAME, 1);
+BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 1, VALID, 1);
+BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 1, SAME, 1);
+BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 2, VALID, 1);
+BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 2, VALID, 1);
+BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 2, SAME, 1);
+BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 2, SAME, 1);
+BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 2, VALID, 1);
+BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 2, SAME, 1);
 
-}  //  namespace mace
+}  // namespace mace
```