Commit b649be5f: add MACE_ prefix for macros

Authored May 28, 2018 by yejianwu
Parent commit: 871e3392

Showing 97 changed files with 1134 additions and 1118 deletions (+1134, -1118)
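The change is a mechanical rename: project-level macros gain a MACE_ prefix, presumably so they cannot collide with macros defined by client code or other headers, since the C preprocessor has no notion of namespaces. A representative before/after pair, taken from the mace/core/operator.h hunk below:

    // Before: the registration macro had no project prefix.
    #define REGISTER_OPERATOR(op_registry, name, ...) \
      MACE_REGISTER_CLASS(OpRegistry, op_registry->registry(), name, __VA_ARGS__)

    // After: the same macro, now carrying the MACE_ prefix by convention.
    #define MACE_REGISTER_OPERATOR(op_registry, name, ...) \
      MACE_REGISTER_CLASS(OpRegistry, op_registry->registry(), name, __VA_ARGS__)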
Changed files (97):

mace/core/arg_helper.cc  (+0, -1)
mace/core/operator.h  (+8, -9)
mace/core/runtime/hexagon/hexagon_control_wrapper.cc  (+10, -10)
mace/core/runtime/hexagon/hexagon_nn_ops.h  (+2, -2)
mace/core/tensor.h  (+6, -6)
mace/core/testing/test_benchmark.h  (+2, -2)
mace/core/workspace.cc  (+2, -1)
mace/kernels/arm/conv_2d_neon_5x5.cc  (+4, -4)
mace/kernels/arm/conv_2d_neon_7x7.cc  (+16, -16)
mace/kernels/conv_2d.h  (+1, -1)
mace/kernels/conv_pool_2d_util.cc  (+5, -5)
mace/kernels/opencl/deconv_2d_opencl.cc  (+3, -2)
mace/kernels/opencl/out_of_range_check_test.cc  (+1, -1)
mace/ops/activation.cc  (+15, -15)
mace/ops/activation_benchmark.cc  (+72, -68)
mace/ops/addn.cc  (+15, -15)
mace/ops/addn_benchmark.cc  (+15, -13)
mace/ops/batch_norm.cc  (+15, -15)
mace/ops/batch_norm.h  (+2, -2)
mace/ops/batch_norm_benchmark.cc  (+26, -26)
mace/ops/batch_to_space.cc  (+15, -15)
mace/ops/batch_to_space.h  (+2, -2)
mace/ops/batch_to_space_benchmark.cc  (+18, -17)
mace/ops/bias_add.cc  (+15, -15)
mace/ops/bias_add.h  (+2, -2)
mace/ops/bias_add_benchmark.cc  (+26, -26)
mace/ops/buffer_to_image.cc  (+10, -10)
mace/ops/buffer_to_image.h  (+2, -2)
mace/ops/buffer_to_image_benchmark.cc  (+22, -22)
mace/ops/channel_shuffle.cc  (+15, -15)
mace/ops/channel_shuffle.h  (+2, -2)
mace/ops/channel_shuffle_benchmark.cc  (+18, -17)
mace/ops/concat.cc  (+15, -15)
mace/ops/concat.h  (+1, -1)
mace/ops/concat_benchmark.cc  (+25, -25)
mace/ops/conv_2d.cc  (+15, -15)
mace/ops/conv_2d.h  (+2, -2)
mace/ops/conv_2d_benchmark.cc  (+37, -38)
mace/ops/deconv_2d.cc  (+15, -15)
mace/ops/deconv_2d.h  (+2, -2)
mace/ops/deconv_2d_benchmark.cc  (+22, -22)
mace/ops/depth_to_space.cc  (+15, -15)
mace/ops/depth_to_space.h  (+2, -2)
mace/ops/depth_to_space_benchmark.cc  (+18, -17)
mace/ops/depthwise_conv2d.cc  (+15, -15)
mace/ops/depthwise_conv2d.h  (+2, -2)
mace/ops/depthwise_conv2d_benchmark.cc  (+53, -53)
mace/ops/eltwise.cc  (+15, -15)
mace/ops/eltwise.h  (+1, -1)
mace/ops/eltwise_benchmark.cc  (+23, -22)
mace/ops/folded_batch_norm.cc  (+15, -15)
mace/ops/folded_batch_norm.h  (+2, -2)
mace/ops/fully_connected.cc  (+15, -15)
mace/ops/fully_connected.h  (+2, -2)
mace/ops/fully_connected_benchmark.cc  (+20, -20)
mace/ops/image_to_buffer.cc  (+10, -10)
mace/ops/image_to_buffer.h  (+2, -2)
mace/ops/local_response_norm.cc  (+5, -5)
mace/ops/local_response_norm.h  (+2, -2)
mace/ops/local_response_norm_benchmark.cc  (+21, -19)
mace/ops/matmul.cc  (+15, -15)
mace/ops/matmul_benchmark.cc  (+14, -13)
mace/ops/pad.cc  (+15, -15)
mace/ops/pad_benchmark.cc  (+17, -17)
mace/ops/pooling.cc  (+15, -15)
mace/ops/pooling.h  (+2, -2)
mace/ops/pooling_benchmark.cc  (+20, -20)
mace/ops/proposal.cc  (+5, -5)
mace/ops/proposal.h  (+2, -2)
mace/ops/psroi_align.cc  (+5, -5)
mace/ops/psroi_align.h  (+2, -2)
mace/ops/quantize.cc  (+15, -15)
mace/ops/quantize.h  (+6, -6)
mace/ops/reshape.cc  (+5, -5)
mace/ops/reshape.h  (+2, -2)
mace/ops/resize_bilinear.cc  (+15, -15)
mace/ops/resize_bilinear_benchmark.cc  (+24, -25)
mace/ops/slice.cc  (+15, -15)
mace/ops/slice.h  (+1, -1)
mace/ops/slice_benchmark.cc  (+14, -12)
mace/ops/softmax.cc  (+15, -15)
mace/ops/softmax.h  (+2, -2)
mace/ops/softmax_benchmark.cc  (+18, -18)
mace/ops/space_to_batch.cc  (+15, -15)
mace/ops/space_to_batch.h  (+2, -2)
mace/ops/space_to_batch_benchmark.cc  (+19, -19)
mace/ops/space_to_depth.cc  (+15, -15)
mace/ops/space_to_depth.h  (+2, -2)
mace/ops/space_to_depth_benchmark.cc  (+18, -17)
mace/ops/transpose.cc  (+5, -5)
mace/ops/transpose.h  (+2, -2)
mace/ops/transpose_benchmark.cc  (+29, -29)
mace/ops/winograd_inverse_transform.cc  (+10, -10)
mace/ops/winograd_inverse_transform.h  (+2, -2)
mace/ops/winograd_transform.cc  (+10, -10)
mace/ops/winograd_transform.h  (+2, -2)
mace/ops/winograd_transform_benchmark.cc  (+20, -18)
mace/core/arg_helper.cc

@@ -95,5 +95,4 @@ MACE_GET_REPEATED_ARGUMENT_FUNC(float, floats, false)
 MACE_GET_REPEATED_ARGUMENT_FUNC(int, ints, true)
 MACE_GET_REPEATED_ARGUMENT_FUNC(int64_t, ints, true)
 #undef MACE_GET_REPEATED_ARGUMENT_FUNC
 }  // namespace mace
mace/core/operator.h

@@ -124,20 +124,19 @@ class Operator : public OperatorBase {
   ~Operator() noexcept override {}
 };

-// OP_INPUT_TAGS and OP_OUTPUT_TAGS are optional features to name the indices of
-// the
-// operator's inputs and outputs, in order to avoid confusion. For example, for
-// a fully convolution layer that has input, weight and bias, you can define its
-// input tags as:
-// OP_INPUT_TAGS(INPUT, WEIGHT, BIAS);
+// MACE_OP_INPUT_TAGS and MACE_OP_OUTPUT_TAGS are optional features to name the
+// indices of the operator's inputs and outputs, in order to avoid confusion.
+// For example, for a fully convolution layer that has input, weight and bias,
+// you can define its input tags as:
+// MACE_OP_INPUT_TAGS(INPUT, WEIGHT, BIAS);
 // And in the code, instead of doing
 //     auto& weight = Input(1);
 // you can now do
 //     auto& weight = Input(WEIGHT);
 // to make it more clear.
-#define OP_INPUT_TAGS(first_input, ...) \
+#define MACE_OP_INPUT_TAGS(first_input, ...) \
   enum _InputTags { first_input = 0, __VA_ARGS__ }
-#define OP_OUTPUT_TAGS(first_input, ...) \
+#define MACE_OP_OUTPUT_TAGS(first_input, ...) \
   enum _OutputTags { first_input = 0, __VA_ARGS__ }

 class OpKeyBuilder {

@@ -186,7 +185,7 @@ MACE_DECLARE_REGISTRY(OpRegistry,
                       const OperatorDef &,
                       Workspace *);

-#define REGISTER_OPERATOR(op_registry, name, ...) \
+#define MACE_REGISTER_OPERATOR(op_registry, name, ...) \
   MACE_REGISTER_CLASS(OpRegistry, op_registry->registry(), name, __VA_ARGS__)

 }  // namespace mace
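As the renamed comment notes, the tag macros only generate named enum constants. A minimal sketch of what MACE_OP_INPUT_TAGS(INPUT, WEIGHT, BIAS) expands to; the tag names here come from the comment above, not from a real operator:

    // Expansion of MACE_OP_INPUT_TAGS(INPUT, WEIGHT, BIAS):
    enum _InputTags { INPUT = 0, WEIGHT, BIAS };
    // So Input(WEIGHT) is just Input(1), but with a self-documenting name.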
mace/core/runtime/hexagon/hexagon_control_wrapper.cc

@@ -32,7 +32,7 @@ inline int64_t NowMicros() {
 namespace mace {

-#define MAX_NODE 2048
+#define MACE_MAX_NODE 2048

 enum {
   NN_GRAPH_PERFEVENT_CYCLES = 0,

@@ -229,13 +229,13 @@ bool HexagonControlWrapper::TeardownGraph() {
   return hexagon_nn_teardown(nn_id_) == 0;
 }

-#define PRINT_BUFSIZE (2 * 1024 * 1024)
+#define MACE_PRINT_BUFSIZE (2 * 1024 * 1024)

 void HexagonControlWrapper::PrintLog() {
   char *buf;
-  if ((buf = new char[PRINT_BUFSIZE]) == NULL) return;
+  if ((buf = new char[MACE_PRINT_BUFSIZE]) == NULL) return;
   MACE_CHECK(hexagon_nn_getlog(nn_id_, reinterpret_cast<unsigned char *>(buf),
-                               PRINT_BUFSIZE) == 0,
+                               MACE_PRINT_BUFSIZE) == 0,
              "print log error");
   LOG(INFO) << std::string(buf);
   delete[] buf;

@@ -244,9 +244,9 @@ void HexagonControlWrapper::PrintLog() {
 void HexagonControlWrapper::PrintGraph() {
   LOG(INFO) << "Print Graph";
   char *buf;
-  if ((buf = new char[PRINT_BUFSIZE]) == NULL) return;
+  if ((buf = new char[MACE_PRINT_BUFSIZE]) == NULL) return;
   MACE_CHECK(hexagon_nn_snpprint(nn_id_, reinterpret_cast<unsigned char *>(buf),
-                                 PRINT_BUFSIZE) == 0,
+                                 MACE_PRINT_BUFSIZE) == 0,
              "print graph error");
   LOG(INFO) << std::string(buf);
   delete[] buf;

@@ -265,9 +265,9 @@ void HexagonControlWrapper::SetGraphMode(int mode) {
 void HexagonControlWrapper::GetPerfInfo() {
   LOG(INFO) << "Get perf info";
-  std::vector<hexagon_nn_perfinfo> perf_info(MAX_NODE);
+  std::vector<hexagon_nn_perfinfo> perf_info(MACE_MAX_NODE);
   unsigned int n_items = 0;
-  MACE_CHECK(hexagon_nn_get_perfinfo(nn_id_, perf_info.data(), MAX_NODE,
+  MACE_CHECK(hexagon_nn_get_perfinfo(nn_id_, perf_info.data(), MACE_MAX_NODE,
                                      &n_items) == 0,
              "get perf info error");

@@ -284,8 +284,8 @@ void HexagonControlWrapper::GetPerfInfo() {
                perf_info[i].counter_lo) * 1.0f / perf_info[i].executions;
-    char node_type_buf[MAX_NODE];
-    hexagon_nn_op_id_to_name(node_type_id, node_type_buf, MAX_NODE);
+    char node_type_buf[MACE_MAX_NODE];
+    hexagon_nn_op_id_to_name(node_type_id, node_type_buf, MACE_MAX_NODE);
     std::string node_type(node_type_buf);
     LOG(INFO) << "node id: " << perf_info[i].node_id
               << ", node type: " << node_type
mace/core/runtime/hexagon/hexagon_nn_ops.h

@@ -22,7 +22,7 @@
 namespace mace {

-#define OP_INVALID -1
+#define MACE_OP_INVALID -1

 typedef enum op_type_enum {
 #define DEF_OP(NAME, ...) OP_##NAME,

@@ -48,7 +48,7 @@ class OpMap {
       return op_map_[op_type];
     } else {
       LOG(ERROR) << "DSP unsupoorted op type: " << op_type;
-      return OP_INVALID;
+      return MACE_OP_INVALID;
     }
   }
mace/core/tensor.h

@@ -30,9 +30,9 @@
 #ifdef MACE_ENABLE_NEON
 // Avoid over-bound accessing memory
-#define EXTRA_BUFFER_PAD_SIZE 64
+#define MACE_EXTRA_BUFFER_PAD_SIZE 64
 #else
-#define EXTRA_BUFFER_PAD_SIZE 0
+#define MACE_EXTRA_BUFFER_PAD_SIZE 0
 #endif

 namespace mace {

@@ -210,16 +210,16 @@ class Tensor {
     image_shape_.clear();
     if (buffer_ != nullptr) {
       MACE_CHECK(!has_opencl_image(), "Cannot resize image, use ResizeImage.");
-      if (raw_size() + EXTRA_BUFFER_PAD_SIZE > buffer_->size()) {
+      if (raw_size() + MACE_EXTRA_BUFFER_PAD_SIZE > buffer_->size()) {
         LOG(WARNING) << "Resize buffer from size " << buffer_->size() << " to "
-                     << raw_size() + EXTRA_BUFFER_PAD_SIZE;
-        return buffer_->Resize(raw_size() + EXTRA_BUFFER_PAD_SIZE);
+                     << raw_size() + MACE_EXTRA_BUFFER_PAD_SIZE;
+        return buffer_->Resize(raw_size() + MACE_EXTRA_BUFFER_PAD_SIZE);
       }
       return MaceStatus::MACE_SUCCESS;
     } else {
       MACE_CHECK(is_buffer_owner_);
       buffer_ = new Buffer(allocator_);
-      return buffer_->Allocate(raw_size() + EXTRA_BUFFER_PAD_SIZE);
+      return buffer_->Allocate(raw_size() + MACE_EXTRA_BUFFER_PAD_SIZE);
     }
   }
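A small worked example of the padding rule above, with an assumed tensor shape chosen only for illustration:

    // Assuming a float tensor of shape {1, 3, 224, 224} and MACE_ENABLE_NEON:
    //   raw_size() = 1 * 3 * 224 * 224 * sizeof(float) = 602112 bytes
    //   allocated  = raw_size() + MACE_EXTRA_BUFFER_PAD_SIZE
    //              = 602112 + 64 = 602176 bytes
    // The 64-byte slack keeps vector loads that run past the logical end in bounds.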
mace/core/testing/test_benchmark.h

@@ -21,8 +21,8 @@
 #include <vector>

 #define MACE_BENCHMARK_CONCAT(a, b, c) a##b##c
-#define BENCHMARK(n)                                        \
-  static ::mace::testing::Benchmark *MACE_BENCHMARK_CONCAT( \
+#define MACE_BENCHMARK(n)                                   \
+  static ::mace::testing::Benchmark *MACE_BENCHMARK_CONCAT( \
       __benchmark_, n, __LINE__) = (new ::mace::testing::Benchmark(#n, (n)))

 namespace mace {
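A minimal sketch of how the renamed registration macro is used; BM_DummyCopy is a made-up benchmark, but the pattern matches the *_benchmark.cc changes later in this commit:

    #include <algorithm>
    #include <vector>

    // Hypothetical benchmark body; the framework passes the iteration count.
    static void BM_DummyCopy(int iters) {
      std::vector<float> src(1024, 1.0f), dst(1024);
      for (int i = 0; i < iters; ++i) {
        std::copy(src.begin(), src.end(), dst.begin());
      }
    }

    // Before this commit the registration macro was BENCHMARK(BM_DummyCopy);
    // it expands to a file-static ::mace::testing::Benchmark* whose name is
    // built with MACE_BENCHMARK_CONCAT(__benchmark_, BM_DummyCopy, __LINE__).
    MACE_BENCHMARK(BM_DummyCopy);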
mace/core/workspace.cc

@@ -169,7 +169,8 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
       std::unique_ptr<BufferBase> tensor_buf(
           new Buffer(GetDeviceAllocator(device_type)));
       MaceStatus status = tensor_buf->Allocate(
-          mem_block.x() * GetEnumTypeSize(dtype) + EXTRA_BUFFER_PAD_SIZE);
+          mem_block.x() * GetEnumTypeSize(dtype)
+          + MACE_EXTRA_BUFFER_PAD_SIZE);
       if (status != MaceStatus::MACE_SUCCESS) {
         return status;
       }
mace/kernels/arm/conv_2d_neon_5x5.cc

@@ -21,7 +21,7 @@
 namespace mace {
 namespace kernels {

-#define Conv2dNeonK5x5SnLoadCalc4                      \
+#define MACE_Conv2dNeonK5x5SnLoadCalc4                 \
   /* load filter (4 outch x 1 height x 4 width) */     \
   float32x4_t vf00, vf10, vf20, vf30;                  \
   float32x2_t vf01, vf11, vf21, vf31;                  \

@@ -62,7 +62,7 @@ namespace kernels {
   vo3 = vmlaq_lane_f32(vo3, vi3, vget_high_f32(vf30), 1); \
   vo3 = vmlaq_lane_f32(vo3, vi4, vf31, 1);

-#define Conv2dNeonK5x5SnLoadCalc1                      \
+#define MACE_Conv2dNeonK5x5SnLoadCalc1                 \
   /* load filter (1 outch x 1 height x 4 width) */     \
   float32x4_t vf00;                                    \
   float32x2_t vf01;                                    \

@@ -138,7 +138,7 @@ void Conv2dNeonK5x5S1(const float *input,
             vi2 = vextq_f32(vi0, vi4, 2);
             vi3 = vextq_f32(vi0, vi4, 3);
-            Conv2dNeonK5x5SnLoadCalc4;
+            MACE_Conv2dNeonK5x5SnLoadCalc4;
             in_offset += in_width;
             filter_ptr0 += 5;

@@ -194,7 +194,7 @@ void Conv2dNeonK5x5S1(const float *input,
             vi2 = vextq_f32(vi0, vi4, 2);
             vi3 = vextq_f32(vi0, vi4, 3);
-            Conv2dNeonK5x5SnLoadCalc1;
+            MACE_Conv2dNeonK5x5SnLoadCalc1;
             in_offset += in_width;
             filter_ptr0 += 5;
mace/kernels/arm/conv_2d_neon_7x7.cc

@@ -21,7 +21,7 @@
 namespace mace {
 namespace kernels {

-#define Conv2dArmv8NeonK7x7SnLoadCalc4                 \
+#define MACE_Conv2dArmv8NeonK7x7SnLoadCalc4            \
   /* load filter (4 outch x 1 height x 4 width) */     \
   float32x4_t vf00, vf01;                              \
   float32x4_t vf10, vf11;                              \

@@ -72,7 +72,7 @@ namespace kernels {
   vo3 = vfmaq_laneq_f32(vo3, vi5, vf31, 2);            \
   vo3 = vfmaq_laneq_f32(vo3, vi6, vf31, 3);

-#define Conv2dArmv8NeonK7x7SnLoadCalc1                 \
+#define MACE_Conv2dArmv8NeonK7x7SnLoadCalc1            \
   /* load filter (1 outch x 1 height x 4 width) */     \
   float32x4_t vf00, vf01;                              \
   vf00 = vld1q_f32(filter_ptr0);                       \

@@ -87,7 +87,7 @@ namespace kernels {
   vo0 = vfmaq_laneq_f32(vo0, vi5, vf01, 2);            \
   vo0 = vfmaq_laneq_f32(vo0, vi6, vf01, 3);

-#define Conv2dArmv7NeonK7x7SnLoadCalc4                 \
+#define MACE_Conv2dArmv7NeonK7x7SnLoadCalc4            \
   /* load filter (4 outch x 1 height x 4 width) */     \
   float32x4_t vf00, vf01;                              \
   float32x4_t vf10, vf11;                              \

@@ -138,7 +138,7 @@ namespace kernels {
   vo3 = vmlaq_lane_f32(vo3, vi5, vget_high_f32(vf31), 0); \
   vo3 = vmlaq_lane_f32(vo3, vi6, vget_high_f32(vf31), 1);

-#define Conv2dArmv7NeonK7x7SnLoadCalc1                 \
+#define MACE_Conv2dArmv7NeonK7x7SnLoadCalc1            \
   /* load filter (1 outch x 1 height x 4 width) */     \
   float32x4_t vf00, vf01;                              \
   vf00 = vld1q_f32(filter_ptr0);                       \

@@ -220,9 +220,9 @@ void Conv2dNeonK7x7S1(const float *input,
             vi6 = vextq_f32(vi4, vi8, 2);
 #if defined(__aarch64__)
-            Conv2dArmv8NeonK7x7SnLoadCalc4;
+            MACE_Conv2dArmv8NeonK7x7SnLoadCalc4;
 #else
-            Conv2dArmv7NeonK7x7SnLoadCalc4;
+            MACE_Conv2dArmv7NeonK7x7SnLoadCalc4;
 #endif
             in_offset += in_width;

@@ -284,9 +284,9 @@ void Conv2dNeonK7x7S1(const float *input,
             vi6 = vextq_f32(vi4, vi8, 2);
 #if defined(__aarch64__)
-            Conv2dArmv8NeonK7x7SnLoadCalc1;
+            MACE_Conv2dArmv8NeonK7x7SnLoadCalc1;
 #else
-            Conv2dArmv7NeonK7x7SnLoadCalc1;
+            MACE_Conv2dArmv7NeonK7x7SnLoadCalc1;
 #endif
             in_offset += in_width;

@@ -381,9 +381,9 @@ void Conv2dNeonK7x7S2(const float *input,
             vi6 = vextq_f32(vi0, vvi1.val[0], 3);  // [6.8.10.12]
 #if defined(__aarch64__)
-            Conv2dArmv8NeonK7x7SnLoadCalc4;
+            MACE_Conv2dArmv8NeonK7x7SnLoadCalc4;
 #else
-            Conv2dArmv7NeonK7x7SnLoadCalc4;
+            MACE_Conv2dArmv7NeonK7x7SnLoadCalc4;
 #endif
             in_offset += in_width;

@@ -450,9 +450,9 @@ void Conv2dNeonK7x7S2(const float *input,
             vi6 = vextq_f32(vi0, vvi1.val[0], 3);  // [6.8.10.12]
 #if defined(__aarch64__)
-            Conv2dArmv8NeonK7x7SnLoadCalc1;
+            MACE_Conv2dArmv8NeonK7x7SnLoadCalc1;
 #else
-            Conv2dArmv7NeonK7x7SnLoadCalc1;
+            MACE_Conv2dArmv7NeonK7x7SnLoadCalc1;
 #endif
             in_offset += in_width;

@@ -547,9 +547,9 @@ void Conv2dNeonK7x7S3(const float *input,
             vi6 = vextq_f32(vi0, vvi1.val[0], 2);  // [6.9.12.15]
 #if defined(__aarch64__)
-            Conv2dArmv8NeonK7x7SnLoadCalc4;
+            MACE_Conv2dArmv8NeonK7x7SnLoadCalc4;
 #else
-            Conv2dArmv7NeonK7x7SnLoadCalc4;
+            MACE_Conv2dArmv7NeonK7x7SnLoadCalc4;
 #endif
             in_offset += in_width;

@@ -616,9 +616,9 @@ void Conv2dNeonK7x7S3(const float *input,
             vi6 = vextq_f32(vi0, vvi1.val[0], 2);  // [6.9.12.15]
 #if defined(__aarch64__)
-            Conv2dArmv8NeonK7x7SnLoadCalc1;
+            MACE_Conv2dArmv8NeonK7x7SnLoadCalc1;
 #else
-            Conv2dArmv7NeonK7x7SnLoadCalc1;
+            MACE_Conv2dArmv7NeonK7x7SnLoadCalc1;
 #endif
             in_offset += in_width;
mace/kernels/conv_2d.h

@@ -465,7 +465,7 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
       padded_input_size =
           batch * input_channels * (input_height + pad_top + pad_bottom)
               * (input_width + pad_left + pad_right) * sizeof(float)
-          + EXTRA_BUFFER_PAD_SIZE;
+          + MACE_EXTRA_BUFFER_PAD_SIZE;
       total_scratch_size += padded_input_size;
     }
     if (extra_output_height != height || extra_output_width != width) {
mace/kernels/conv_pool_2d_util.cc

@@ -314,7 +314,7 @@ MaceStatus ConstructNCHWInputWithPadding(const Tensor *input_tensor,
   // Skip the padded top rows
   if (padding_same_value) {
-#define COPY_INPUT                                                 \
+#define MACE_COPY_INPUT                                            \
   std::fill(output_data, output_data + padded_left, input[0]);     \
   output_data += padded_left;                                      \
   memcpy(output_data, input, width * sizeof(float));               \

@@ -328,20 +328,20 @@ MaceStatus ConstructNCHWInputWithPadding(const Tensor *input_tensor,
     for (int i = 0; i < batch; ++i) {
       for (int j = 0; j < channels; ++j) {
         for (int k = 0; k < padded_top; ++k) {
-          COPY_INPUT;
+          MACE_COPY_INPUT;
         }
         for (int k = 0; k < height; ++k) {
-          COPY_INPUT;
+          MACE_COPY_INPUT;
           input += width;
         }
         input -= width;
         for (int k = 0; k < padded_bottom; ++k) {
-          COPY_INPUT;
+          MACE_COPY_INPUT;
         }
         input += width;
       }
     }
-#undef COPY_INPUT
+#undef MACE_COPY_INPUT
   } else {
     output_data += padded_top * output_width;
     for (int i = 0; i < batch; ++i) {
mace/kernels/opencl/deconv_2d_opencl.cc

@@ -43,9 +43,10 @@ void Deconv2dOpencl(cl::Kernel *kernel,
   const index_t channel_blocks = RoundUpDiv4(channels);
   const index_t input_channel_blocks = RoundUpDiv4(input_channels);
   MACE_CHECK(stride > 0, "stride should > 0.");
-#define WIDTH_BLK 5
+#define MACE_WIDTH_BLK 5
   const index_t n_strides = (width + stride - 1) / stride;
-  const index_t width_blocks = ((n_strides + WIDTH_BLK - 1) / WIDTH_BLK) * stride;
+  const index_t width_blocks =
+      ((n_strides + MACE_WIDTH_BLK - 1) / MACE_WIDTH_BLK) * stride;
   const float stride_r = 1.f / static_cast<float>(stride);
   const int padding_h = (paddings[0] + 1) >> 1;
   const int padding_w = (paddings[0] + 1) >> 1;
mace/kernels/opencl/out_of_range_check_test.cc

@@ -125,7 +125,7 @@ bool BufferToImageOpImpl(Tensor *buffer,
 class OutOfRangeCheckTest : public ::testing::Test {
  protected:
   virtual void SetUp() {
-    setenv("OUT_OF_RANGE_CHECK", "1", 1);
+    setenv("MACE_OUT_OF_RANGE_CHECK", "1", 1);
   }
 };
mace/ops/activation.cc

@@ -18,24 +18,24 @@ namespace mace {
 namespace ops {

 void Register_Activation(OperatorRegistry *op_registry) {
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Activation").Device(DeviceType::CPU).TypeConstraint<float>("T").Build(), ActivationOp<DeviceType::CPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Activation").Device(DeviceType::CPU).TypeConstraint<float>("T").Build(), ActivationOp<DeviceType::CPU, float>);

 #ifdef MACE_ENABLE_OPENCL
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Activation").Device(DeviceType::GPU).TypeConstraint<float>("T").Build(), ActivationOp<DeviceType::GPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Activation").Device(DeviceType::GPU).TypeConstraint<float>("T").Build(), ActivationOp<DeviceType::GPU, float>);
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Activation").Device(DeviceType::GPU).TypeConstraint<half>("T").Build(), ActivationOp<DeviceType::GPU, half>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Activation").Device(DeviceType::GPU).TypeConstraint<half>("T").Build(), ActivationOp<DeviceType::GPU, half>);
 #endif  // MACE_ENABLE_OPENCL
 }
mace/ops/activation_benchmark.cc

@@ -71,25 +71,26 @@ void ReluBenchmark(int iters, int batch, int channels, int height, int width) {
 }
 }  // namespace

-#define BM_RELU_MACRO(N, C, H, W, TYPE, DEVICE)                              \
-  static void BM_RELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(int iters) { \
+#define MACE_BM_RELU_MACRO(N, C, H, W, TYPE, DEVICE)                         \
+  static void MACE_BM_RELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(        \
+      int iters) {                                                           \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;         \
     mace::testing::MaccProcessed(tot);                                       \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                      \
     ReluBenchmark<DEVICE, TYPE>(iters, N, C, H, W);                          \
   }                                                                          \
-  BENCHMARK(BM_RELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
+  MACE_BENCHMARK(MACE_BM_RELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)

-#define BM_RELU(N, C, H, W)              \
-  BM_RELU_MACRO(N, C, H, W, float, CPU); \
-  BM_RELU_MACRO(N, C, H, W, float, GPU); \
-  BM_RELU_MACRO(N, C, H, W, half, GPU);
+#define MACE_BM_RELU(N, C, H, W)              \
+  MACE_BM_RELU_MACRO(N, C, H, W, float, CPU); \
+  MACE_BM_RELU_MACRO(N, C, H, W, float, GPU); \
+  MACE_BM_RELU_MACRO(N, C, H, W, half, GPU);

-BM_RELU(1, 1, 512, 512);
-BM_RELU(1, 3, 128, 128);
-BM_RELU(1, 3, 512, 512);
-BM_RELU(1, 32, 112, 112);
-BM_RELU(1, 64, 256, 256);
+MACE_BM_RELU(1, 1, 512, 512);
+MACE_BM_RELU(1, 3, 128, 128);
+MACE_BM_RELU(1, 3, 512, 512);
+MACE_BM_RELU(1, 32, 112, 112);
+MACE_BM_RELU(1, 64, 256, 256);

The ReluxBenchmark (@@ -138,25 +139,26), PreluBenchmark (@@ -212,25 +214,26), TanhBenchmark (@@ -277,25 +280,26) and SigmoidBenchmark (@@ -343,26 +347,26) blocks change in exactly the same way: BM_RELUX, BM_PRELU, BM_TANH, BM_SIGMOID and their *_MACRO helpers gain the MACE_ prefix, BENCHMARK becomes MACE_BENCHMARK, and the same five configurations (1, 1, 512, 512), (1, 3, 128, 128), (1, 3, 512, 512), (1, 32, 112, 112), (1, 64, 256, 256) are re-emitted with the new names.

 }  // namespace test
 }  // namespace ops
mace/ops/addn.cc

@@ -18,24 +18,24 @@ namespace mace {
 namespace ops {

 void Register_AddN(OperatorRegistry *op_registry) {
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("AddN").Device(DeviceType::CPU).TypeConstraint<float>("T").Build(), AddNOp<DeviceType::CPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("AddN").Device(DeviceType::CPU).TypeConstraint<float>("T").Build(), AddNOp<DeviceType::CPU, float>);

 #ifdef MACE_ENABLE_OPENCL
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("AddN").Device(DeviceType::GPU).TypeConstraint<float>("T").Build(), AddNOp<DeviceType::GPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("AddN").Device(DeviceType::GPU).TypeConstraint<float>("T").Build(), AddNOp<DeviceType::GPU, float>);
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("AddN").Device(DeviceType::GPU).TypeConstraint<half>("T").Build(), AddNOp<DeviceType::GPU, half>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("AddN").Device(DeviceType::GPU).TypeConstraint<half>("T").Build(), AddNOp<DeviceType::GPU, half>);
 #endif  // MACE_ENABLE_OPENCL
 }
mace/ops/addn_benchmark.cc

@@ -70,26 +70,28 @@ void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) {
 }
 }  // namespace

-#define BM_ADDN_MACRO(INPUTS, N, H, W, C, TYPE, DEVICE)                       \
-  static void BM_ADDN_##INPUTS##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE(   \
-      int iters) {                                                            \
+#define MACE_BM_ADDN_MACRO(INPUTS, N, H, W, C, TYPE, DEVICE)                  \
+  static void                                                                 \
+      MACE_BM_ADDN_##INPUTS##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE(      \
+          int iters) {                                                        \
     const int64_t tot = static_cast<int64_t>(iters) * INPUTS * N * H * W * C; \
     mace::testing::MaccProcessed(tot);                                        \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                       \
     AddNBenchmark<DEVICE, TYPE>(iters, INPUTS, N, H, W, C);                   \
   }                                                                           \
-  BENCHMARK(BM_ADDN_##INPUTS##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)
+  MACE_BENCHMARK(                                                             \
+      MACE_BM_ADDN_##INPUTS##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)

-#define BM_ADDN(INPUTS, N, H, W, C)              \
-  BM_ADDN_MACRO(INPUTS, N, H, W, C, float, CPU); \
-  BM_ADDN_MACRO(INPUTS, N, H, W, C, float, GPU); \
-  BM_ADDN_MACRO(INPUTS, N, H, W, C, half, GPU);
+#define MACE_BM_ADDN(INPUTS, N, H, W, C)              \
+  MACE_BM_ADDN_MACRO(INPUTS, N, H, W, C, float, CPU); \
+  MACE_BM_ADDN_MACRO(INPUTS, N, H, W, C, float, GPU); \
+  MACE_BM_ADDN_MACRO(INPUTS, N, H, W, C, half, GPU);

-BM_ADDN(2, 1, 256, 256, 32);
-BM_ADDN(2, 1, 128, 128, 32);
-BM_ADDN(4, 1, 128, 128, 3);
-BM_ADDN(2, 1, 256, 256, 3);
-BM_ADDN(2, 1, 512, 512, 3);
+MACE_BM_ADDN(2, 1, 256, 256, 32);
+MACE_BM_ADDN(2, 1, 128, 128, 32);
+MACE_BM_ADDN(4, 1, 128, 128, 3);
+MACE_BM_ADDN(2, 1, 256, 256, 3);
+MACE_BM_ADDN(2, 1, 512, 512, 3);

 }  // namespace test
 }  // namespace ops
mace/ops/batch_norm.cc

@@ -18,24 +18,24 @@ namespace mace {
 namespace ops {

 void Register_BatchNorm(OperatorRegistry *op_registry) {
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchNorm").Device(DeviceType::CPU).TypeConstraint<float>("T").Build(), BatchNormOp<DeviceType::CPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchNorm").Device(DeviceType::CPU).TypeConstraint<float>("T").Build(), BatchNormOp<DeviceType::CPU, float>);

 #ifdef MACE_ENABLE_OPENCL
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchNorm").Device(DeviceType::GPU).TypeConstraint<float>("T").Build(), BatchNormOp<DeviceType::GPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchNorm").Device(DeviceType::GPU).TypeConstraint<float>("T").Build(), BatchNormOp<DeviceType::GPU, float>);
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchNorm").Device(DeviceType::GPU).TypeConstraint<half>("T").Build(), BatchNormOp<DeviceType::GPU, half>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchNorm").Device(DeviceType::GPU).TypeConstraint<half>("T").Build(), BatchNormOp<DeviceType::GPU, half>);
 #endif  // MACE_ENABLE_OPENCL
 }
mace/ops/batch_norm.h

@@ -60,8 +60,8 @@ class BatchNormOp : public Operator<D, T> {
   kernels::BatchNormFunctor<D, T> functor_;

 protected:
-  OP_INPUT_TAGS(INPUT, SCALE, OFFSET, MEAN, VAR);
-  OP_OUTPUT_TAGS(OUTPUT);
+  MACE_OP_INPUT_TAGS(INPUT, SCALE, OFFSET, MEAN, VAR);
+  MACE_OP_OUTPUT_TAGS(OUTPUT);
 };

 }  // namespace ops
mace/ops/batch_norm_benchmark.cc

@@ -95,34 +95,34 @@ void BatchNorm(
 }
 }  // namespace

-#define BM_BATCH_NORM_MACRO(N, C, H, W, TYPE, DEVICE)                        \
-  static void BM_BATCH_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(       \
+#define MACE_BM_BATCH_NORM_MACRO(N, C, H, W, TYPE, DEVICE)                   \
+  static void MACE_BM_BATCH_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(  \
       int iters) {                                                           \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;         \
     mace::testing::MaccProcessed(tot);                                       \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                      \
     BatchNorm<DEVICE, TYPE>(iters, N, C, H, W);                              \
   }                                                                          \
-  BENCHMARK(BM_BATCH_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
+  MACE_BENCHMARK(MACE_BM_BATCH_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)

-#define BM_BATCH_NORM(N, C, H, W)              \
-  BM_BATCH_NORM_MACRO(N, C, H, W, float, CPU); \
-  BM_BATCH_NORM_MACRO(N, C, H, W, float, GPU); \
-  BM_BATCH_NORM_MACRO(N, C, H, W, half, GPU);
+#define MACE_BM_BATCH_NORM(N, C, H, W)              \
+  MACE_BM_BATCH_NORM_MACRO(N, C, H, W, float, CPU); \
+  MACE_BM_BATCH_NORM_MACRO(N, C, H, W, float, GPU); \
+  MACE_BM_BATCH_NORM_MACRO(N, C, H, W, half, GPU);

-BM_BATCH_NORM(1, 1, 512, 512);
-BM_BATCH_NORM(1, 3, 128, 128);
-BM_BATCH_NORM(1, 3, 512, 512);
-BM_BATCH_NORM(1, 32, 112, 112);
-BM_BATCH_NORM(1, 64, 256, 256);
-BM_BATCH_NORM(1, 64, 512, 512);
-BM_BATCH_NORM(1, 128, 56, 56);
-BM_BATCH_NORM(1, 128, 256, 256);
-BM_BATCH_NORM(1, 256, 14, 14);
-BM_BATCH_NORM(1, 512, 14, 14);
-BM_BATCH_NORM(1, 1024, 7, 7);
-BM_BATCH_NORM(32, 1, 256, 256);
-BM_BATCH_NORM(32, 3, 256, 256);
+MACE_BM_BATCH_NORM(1, 1, 512, 512);
+MACE_BM_BATCH_NORM(1, 3, 128, 128);
+MACE_BM_BATCH_NORM(1, 3, 512, 512);
+MACE_BM_BATCH_NORM(1, 32, 112, 112);
+MACE_BM_BATCH_NORM(1, 64, 256, 256);
+MACE_BM_BATCH_NORM(1, 64, 512, 512);
+MACE_BM_BATCH_NORM(1, 128, 56, 56);
+MACE_BM_BATCH_NORM(1, 128, 256, 256);
+MACE_BM_BATCH_NORM(1, 256, 14, 14);
+MACE_BM_BATCH_NORM(1, 512, 14, 14);
+MACE_BM_BATCH_NORM(1, 1024, 7, 7);
+MACE_BM_BATCH_NORM(32, 1, 256, 256);
+MACE_BM_BATCH_NORM(32, 3, 256, 256);

 }  // namespace test
 }  // namespace ops
mace/ops/batch_to_space.cc

@@ -18,22 +18,22 @@ namespace mace {
 namespace ops {

 void Register_BatchToSpaceND(OperatorRegistry *op_registry) {
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchToSpaceND").Device(DeviceType::CPU).TypeConstraint<float>("T").Build(), BatchToSpaceNDOp<DeviceType::CPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchToSpaceND").Device(DeviceType::CPU).TypeConstraint<float>("T").Build(), BatchToSpaceNDOp<DeviceType::CPU, float>);

 #ifdef MACE_ENABLE_OPENCL
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchToSpaceND").Device(DeviceType::GPU).TypeConstraint<float>("T").Build(), BatchToSpaceNDOp<DeviceType::GPU, float>);
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchToSpaceND").Device(DeviceType::GPU).TypeConstraint<half>("T").Build(), BatchToSpaceNDOp<DeviceType::GPU, half>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchToSpaceND").Device(DeviceType::GPU).TypeConstraint<float>("T").Build(), BatchToSpaceNDOp<DeviceType::GPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchToSpaceND").Device(DeviceType::GPU).TypeConstraint<half>("T").Build(), BatchToSpaceNDOp<DeviceType::GPU, half>);
 #endif  // MACE_ENABLE_OPENCL
 }
mace/ops/batch_to_space.h

@@ -44,8 +44,8 @@ class BatchToSpaceNDOp : public Operator<D, T> {
   kernels::SpaceToBatchFunctor<D, T> functor_;

 protected:
-  OP_INPUT_TAGS(INPUT);
-  OP_OUTPUT_TAGS(OUTPUT);
+  MACE_OP_INPUT_TAGS(INPUT);
+  MACE_OP_OUTPUT_TAGS(OUTPUT);
 };

 }  // namespace ops
mace/ops/batch_to_space_benchmark.cc

@@ -64,25 +64,26 @@ void BMBatchToSpace(
 }
 }  // namespace

-#define BM_BATCH_TO_SPACE_MACRO(N, H, W, C, ARG, TYPE, DEVICE)                 \
-  static void                                                                  \
-      BM_BATCH_TO_SPACE_##N##_##H##_##W##_##C##_##ARG##_##TYPE##_##DEVICE(     \
+#define MACE_BM_BATCH_TO_SPACE_MACRO(N, H, W, C, ARG, TYPE, DEVICE)            \
+  static void                                                                  \
+      MACE_BM_BATCH_TO_SPACE_##N##_##H##_##W##_##C##_##ARG##_##TYPE##_##DEVICE(\
       int iters) {                                                             \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;           \
     mace::testing::MaccProcessed(tot);                                         \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                        \
     BMBatchToSpace<DEVICE, TYPE>(iters, N, C, H, W, ARG);                      \
   }                                                                            \
-  BENCHMARK(BM_BATCH_TO_SPACE_##N##_##H##_##W##_##C##_##ARG##_##TYPE##_##DEVICE)
+  MACE_BENCHMARK(                                                              \
+      MACE_BM_BATCH_TO_SPACE_##N##_##H##_##W##_##C##_##ARG##_##TYPE##_##DEVICE)

-#define BM_BATCH_TO_SPACE(N, H, W, C, ARG)                 \
-  BM_BATCH_TO_SPACE_MACRO(N, H, W, C, ARG, float, GPU);    \
-  BM_BATCH_TO_SPACE_MACRO(N, H, W, C, ARG, float, CPU);
+#define MACE_BM_BATCH_TO_SPACE(N, H, W, C, ARG)                 \
+  MACE_BM_BATCH_TO_SPACE_MACRO(N, H, W, C, ARG, float, GPU);    \
+  MACE_BM_BATCH_TO_SPACE_MACRO(N, H, W, C, ARG, float, CPU);

-BM_BATCH_TO_SPACE(128, 8, 8, 128, 2);
-BM_BATCH_TO_SPACE(4, 128, 128, 32, 2);
-BM_BATCH_TO_SPACE(16, 64, 64, 32, 4);
-BM_BATCH_TO_SPACE(64, 32, 32, 32, 8);
+MACE_BM_BATCH_TO_SPACE(128, 8, 8, 128, 2);
+MACE_BM_BATCH_TO_SPACE(4, 128, 128, 32, 2);
+MACE_BM_BATCH_TO_SPACE(16, 64, 64, 32, 4);
+MACE_BM_BATCH_TO_SPACE(64, 32, 32, 32, 8);

 }  // namespace test
 }  // namespace ops
mace/ops/bias_add.cc

@@ -18,24 +18,24 @@ namespace mace {
 namespace ops {

 void Register_BiasAdd(OperatorRegistry *op_registry) {
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("BiasAdd").Device(DeviceType::CPU).TypeConstraint<float>("T").Build(), BiasAddOp<DeviceType::CPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BiasAdd").Device(DeviceType::CPU).TypeConstraint<float>("T").Build(), BiasAddOp<DeviceType::CPU, float>);

 #ifdef MACE_ENABLE_OPENCL
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("BiasAdd").Device(DeviceType::GPU).TypeConstraint<float>("T").Build(), BiasAddOp<DeviceType::GPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BiasAdd").Device(DeviceType::GPU).TypeConstraint<float>("T").Build(), BiasAddOp<DeviceType::GPU, float>);
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("BiasAdd").Device(DeviceType::GPU).TypeConstraint<half>("T").Build(), BiasAddOp<DeviceType::GPU, half>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BiasAdd").Device(DeviceType::GPU).TypeConstraint<half>("T").Build(), BiasAddOp<DeviceType::GPU, half>);
 #endif  // MACE_ENABLE_OPENCL
 }
mace/ops/bias_add.h

@@ -46,8 +46,8 @@ class BiasAddOp : public Operator<D, T> {
   kernels::BiasAddFunctor<D, T> functor_;

 protected:
-  OP_INPUT_TAGS(INPUT, BIAS);
-  OP_OUTPUT_TAGS(OUTPUT);
+  MACE_OP_INPUT_TAGS(INPUT, BIAS);
+  MACE_OP_OUTPUT_TAGS(OUTPUT);
 };

 }  // namespace ops
mace/ops/bias_add_benchmark.cc

@@ -72,34 +72,34 @@ void BiasAdd(int iters, int batch, int channels, int height, int width) {
 }
 }  // namespace

-#define BM_BIAS_ADD_MACRO(N, C, H, W, TYPE, DEVICE)                        \
-  static void BM_BIAS_ADD_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(       \
+#define MACE_BM_BIAS_ADD_MACRO(N, C, H, W, TYPE, DEVICE)                   \
+  static void MACE_BM_BIAS_ADD_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(  \
       int iters) {                                                         \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;       \
     mace::testing::MaccProcessed(tot);                                     \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                    \
     BiasAdd<DEVICE, TYPE>(iters, N, C, H, W);                              \
   }                                                                        \
-  BENCHMARK(BM_BIAS_ADD_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
+  MACE_BENCHMARK(MACE_BM_BIAS_ADD_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)

-#define BM_BIAS_ADD(N, C, H, W)              \
-  BM_BIAS_ADD_MACRO(N, C, H, W, float, CPU); \
-  BM_BIAS_ADD_MACRO(N, C, H, W, float, GPU); \
-  BM_BIAS_ADD_MACRO(N, C, H, W, half, GPU);
+#define MACE_BM_BIAS_ADD(N, C, H, W)              \
+  MACE_BM_BIAS_ADD_MACRO(N, C, H, W, float, CPU); \
+  MACE_BM_BIAS_ADD_MACRO(N, C, H, W, float, GPU); \
+  MACE_BM_BIAS_ADD_MACRO(N, C, H, W, half, GPU);

-BM_BIAS_ADD(1, 1, 512, 512);
-BM_BIAS_ADD(1, 3, 128, 128);
-BM_BIAS_ADD(1, 3, 512, 512);
-BM_BIAS_ADD(1, 32, 112, 112);
-BM_BIAS_ADD(1, 64, 256, 256);
-BM_BIAS_ADD(1, 64, 512, 512);
-BM_BIAS_ADD(1, 128, 56, 56);
-BM_BIAS_ADD(1, 128, 256, 256);
-BM_BIAS_ADD(1, 256, 14, 14);
-BM_BIAS_ADD(1, 512, 14, 14);
-BM_BIAS_ADD(1, 1024, 7, 7);
-BM_BIAS_ADD(32, 1, 256, 256);
-BM_BIAS_ADD(32, 3, 256, 256);
+MACE_BM_BIAS_ADD(1, 1, 512, 512);
+MACE_BM_BIAS_ADD(1, 3, 128, 128);
+MACE_BM_BIAS_ADD(1, 3, 512, 512);
+MACE_BM_BIAS_ADD(1, 32, 112, 112);
+MACE_BM_BIAS_ADD(1, 64, 256, 256);
+MACE_BM_BIAS_ADD(1, 64, 512, 512);
+MACE_BM_BIAS_ADD(1, 128, 56, 56);
+MACE_BM_BIAS_ADD(1, 128, 256, 256);
+MACE_BM_BIAS_ADD(1, 256, 14, 14);
+MACE_BM_BIAS_ADD(1, 512, 14, 14);
+MACE_BM_BIAS_ADD(1, 1024, 7, 7);
+MACE_BM_BIAS_ADD(32, 1, 256, 256);
+MACE_BM_BIAS_ADD(32, 3, 256, 256);

 }  // namespace test
 }  // namespace ops
mace/ops/buffer_to_image.cc

@@ -18,17 +18,17 @@ namespace mace {
 namespace ops {

 void Register_BufferToImage(OperatorRegistry *op_registry) {
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("BufferToImage").Device(DeviceType::GPU).TypeConstraint<float>("T").Build(), BufferToImageOp<DeviceType::GPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BufferToImage").Device(DeviceType::GPU).TypeConstraint<float>("T").Build(), BufferToImageOp<DeviceType::GPU, float>);
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("BufferToImage").Device(DeviceType::GPU).TypeConstraint<half>("T").Build(), BufferToImageOp<DeviceType::GPU, half>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BufferToImage").Device(DeviceType::GPU).TypeConstraint<half>("T").Build(), BufferToImageOp<DeviceType::GPU, half>);
 }

 }  // namespace ops
mace/ops/buffer_to_image.h

@@ -42,8 +42,8 @@ class BufferToImageOp : public Operator<D, T> {
   kernels::BufferToImageFunctor<D, T> functor_;

 protected:
-  OP_INPUT_TAGS(INPUT);
-  OP_OUTPUT_TAGS(OUTPUT);
+  MACE_OP_INPUT_TAGS(INPUT);
+  MACE_OP_OUTPUT_TAGS(OUTPUT);
 };

 }  // namespace ops
mace/ops/buffer_to_image_benchmark.cc

@@ -54,36 +54,36 @@ void FilterBufferToImage(int iters,
 }
 }  // namespace

-#define BM_B2I_MACRO(O, I, H, W, TYPE, DEVICE)                         \
-  static void BM_B2I_##O##_##I##_##H##_##W##_##TYPE##_##DEVICE(        \
+#define MACE_BM_B2I_MACRO(O, I, H, W, TYPE, DEVICE)                    \
+  static void MACE_BM_B2I_##O##_##I##_##H##_##W##_##TYPE##_##DEVICE(   \
       int iters) {                                                     \
     const int64_t tot = static_cast<int64_t>(iters) * O * I * H * W;   \
     mace::testing::MaccProcessed(tot);                                 \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                \
     FilterBufferToImage<DEVICE, TYPE>(iters, O, I, H, W);              \
   }                                                                    \
-  BENCHMARK(BM_B2I_##O##_##I##_##H##_##W##_##TYPE##_##DEVICE)
+  MACE_BENCHMARK(MACE_BM_B2I_##O##_##I##_##H##_##W##_##TYPE##_##DEVICE)

-#define BM_B2I(O, I, H, W)              \
-  BM_B2I_MACRO(O, I, H, W, float, GPU); \
-  BM_B2I_MACRO(O, I, H, W, half, GPU);
+#define MACE_BM_B2I(O, I, H, W)              \
+  MACE_BM_B2I_MACRO(O, I, H, W, float, GPU); \
+  MACE_BM_B2I_MACRO(O, I, H, W, half, GPU);

-BM_B2I(5, 3, 3, 3);
-BM_B2I(5, 3, 7, 7);
-BM_B2I(32, 16, 1, 1);
-BM_B2I(32, 16, 3, 3);
-BM_B2I(32, 16, 5, 5);
-BM_B2I(32, 16, 7, 7);
-BM_B2I(64, 32, 1, 1);
-BM_B2I(64, 32, 3, 3);
-BM_B2I(64, 32, 5, 5);
-BM_B2I(64, 32, 7, 7);
-BM_B2I(128, 64, 1, 1);
-BM_B2I(128, 64, 3, 3);
-BM_B2I(128, 32, 1, 1);
-BM_B2I(128, 32, 3, 3);
-BM_B2I(256, 32, 1, 1);
-BM_B2I(256, 32, 3, 3);
+MACE_BM_B2I(5, 3, 3, 3);
+MACE_BM_B2I(5, 3, 7, 7);
+MACE_BM_B2I(32, 16, 1, 1);
+MACE_BM_B2I(32, 16, 3, 3);
+MACE_BM_B2I(32, 16, 5, 5);
+MACE_BM_B2I(32, 16, 7, 7);
+MACE_BM_B2I(64, 32, 1, 1);
+MACE_BM_B2I(64, 32, 3, 3);
+MACE_BM_B2I(64, 32, 5, 5);
+MACE_BM_B2I(64, 32, 7, 7);
+MACE_BM_B2I(128, 64, 1, 1);
+MACE_BM_B2I(128, 64, 3, 3);
+MACE_BM_B2I(128, 32, 1, 1);
+MACE_BM_B2I(128, 32, 3, 3);
+MACE_BM_B2I(256, 32, 1, 1);
+MACE_BM_B2I(256, 32, 3, 3);

 }  // namespace test
 }  // namespace ops
mace/ops/channel_shuffle.cc

@@ -18,24 +18,24 @@ namespace mace {
 namespace ops {

 void Register_ChannelShuffle(OperatorRegistry *op_registry) {
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("ChannelShuffle").Device(DeviceType::CPU).TypeConstraint<float>("T").Build(), ChannelShuffleOp<DeviceType::CPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ChannelShuffle").Device(DeviceType::CPU).TypeConstraint<float>("T").Build(), ChannelShuffleOp<DeviceType::CPU, float>);

 #ifdef MACE_ENABLE_OPENCL
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("ChannelShuffle").Device(DeviceType::GPU).TypeConstraint<float>("T").Build(), ChannelShuffleOp<DeviceType::GPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ChannelShuffle").Device(DeviceType::GPU).TypeConstraint<float>("T").Build(), ChannelShuffleOp<DeviceType::GPU, float>);
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("ChannelShuffle").Device(DeviceType::GPU).TypeConstraint<half>("T").Build(), ChannelShuffleOp<DeviceType::GPU, half>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ChannelShuffle").Device(DeviceType::GPU).TypeConstraint<half>("T").Build(), ChannelShuffleOp<DeviceType::GPU, half>);
 #endif  // MACE_ENABLE_OPENCL
 }
mace/ops/channel_shuffle.h

@@ -50,8 +50,8 @@ class ChannelShuffleOp : public Operator<D, T> {
 protected:
  const int group_;
-  OP_INPUT_TAGS(INPUT);
-  OP_OUTPUT_TAGS(OUTPUT);
+  MACE_OP_INPUT_TAGS(INPUT);
+  MACE_OP_OUTPUT_TAGS(OUTPUT);

 private:
  kernels::ChannelShuffleFunctor<D, T> functor_;
mace/ops/channel_shuffle_benchmark.cc

@@ -69,25 +69,26 @@ void ChannelShuffle(
 }
 }  // namespace

-#define BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, TYPE, DEVICE)                   \
-  static void                                                                   \
-      BM_CHANNEL_SHUFFLE_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE(       \
+#define MACE_BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, TYPE, DEVICE)              \
+  static void                                                                   \
+      MACE_BM_CHANNEL_SHUFFLE_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE(  \
       int iters) {                                                              \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;            \
     mace::testing::MaccProcessed(tot);                                          \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                         \
     ChannelShuffle<DEVICE, TYPE>(iters, N, C, H, W, G);                         \
   }                                                                             \
-  BENCHMARK(BM_CHANNEL_SHUFFLE_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE)
+  MACE_BENCHMARK(                                                               \
+      MACE_BM_CHANNEL_SHUFFLE_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE)

-#define BM_CHANNEL_SHUFFLE(N, C, H, W, G)              \
-  BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, float, CPU); \
-  BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, float, GPU); \
-  BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, half, GPU);
+#define MACE_BM_CHANNEL_SHUFFLE(N, C, H, W, G)              \
+  MACE_BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, float, CPU); \
+  MACE_BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, float, GPU); \
+  MACE_BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, half, GPU);

-BM_CHANNEL_SHUFFLE(1, 64, 64, 64, 8);
-BM_CHANNEL_SHUFFLE(1, 64, 128, 128, 8);
-BM_CHANNEL_SHUFFLE(1, 64, 256, 256, 8);
+MACE_BM_CHANNEL_SHUFFLE(1, 64, 64, 64, 8);
+MACE_BM_CHANNEL_SHUFFLE(1, 64, 128, 128, 8);
+MACE_BM_CHANNEL_SHUFFLE(1, 64, 256, 256, 8);

 }  // namespace test
 }  // namespace ops
mace/ops/concat.cc

@@ -18,24 +18,24 @@ namespace mace {
 namespace ops {

 void Register_Concat(OperatorRegistry *op_registry) {
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat").Device(DeviceType::CPU).TypeConstraint<float>("T").Build(), ConcatOp<DeviceType::CPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat").Device(DeviceType::CPU).TypeConstraint<float>("T").Build(), ConcatOp<DeviceType::CPU, float>);

 #ifdef MACE_ENABLE_OPENCL
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat").Device(DeviceType::GPU).TypeConstraint<float>("T").Build(), ConcatOp<DeviceType::GPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat").Device(DeviceType::GPU).TypeConstraint<float>("T").Build(), ConcatOp<DeviceType::GPU, float>);
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat").Device(DeviceType::GPU).TypeConstraint<half>("T").Build(), ConcatOp<DeviceType::GPU, half>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat").Device(DeviceType::GPU).TypeConstraint<half>("T").Build(), ConcatOp<DeviceType::GPU, half>);
 #endif  // MACE_ENABLE_OPENCL
 }
mace/ops/concat.h

@@ -51,7 +51,7 @@ class ConcatOp : public Operator<D, T> {
   kernels::ConcatFunctor<D, T> functor_;

 private:
-  OP_OUTPUT_TAGS(OUTPUT);
+  MACE_OP_OUTPUT_TAGS(OUTPUT);
 };

 }  // namespace ops
mace/ops/concat_benchmark.cc

@@ -52,16 +52,16 @@ void ConcatHelper(int iters, int concat_dim, int dim1) {
 }
 }  // namespace

-#define BM_CONCAT_CPU_MACRO(DIM0, DIM1)                        \
-  static void BM_CONCAT_CPU_##DIM0##_##DIM1(int iters) {       \
+#define MACE_BM_CONCAT_CPU_MACRO(DIM0, DIM1)                   \
+  static void MACE_BM_CONCAT_CPU_##DIM0##_##DIM1(int iters) {  \
     ConcatHelper<DeviceType::CPU, float>(iters, DIM0, DIM1);   \
   }                                                            \
-  BENCHMARK(BM_CONCAT_CPU_##DIM0##_##DIM1)
+  MACE_BENCHMARK(MACE_BM_CONCAT_CPU_##DIM0##_##DIM1)

-BM_CONCAT_CPU_MACRO(0, 1000);
-BM_CONCAT_CPU_MACRO(0, 100000);
-BM_CONCAT_CPU_MACRO(1, 1000);
-BM_CONCAT_CPU_MACRO(1, 100000);
+MACE_BM_CONCAT_CPU_MACRO(0, 1000);
+MACE_BM_CONCAT_CPU_MACRO(0, 100000);
+MACE_BM_CONCAT_CPU_MACRO(1, 1000);
+MACE_BM_CONCAT_CPU_MACRO(1, 100000);

@@ -106,22 +106,22 @@ void OpenclConcatHelper(int iters,
 }
 }  // namespace

-#define BM_CONCAT_OPENCL_MACRO(N, H, W, C, TYPE)                              \
-  static void BM_CONCAT_OPENCL_##N##_##H##_##W##_##C##_##TYPE(int iters) {    \
+#define MACE_BM_CONCAT_OPENCL_MACRO(N, H, W, C, TYPE)                         \
+  static void MACE_BM_CONCAT_OPENCL_##N##_##H##_##W##_##C##_##TYPE(int iters) { \
     std::vector<index_t> shape = {N, H, W, C};                                \
     OpenclConcatHelper<TYPE>(iters, shape, shape, 3);                         \
   }                                                                           \
-  BENCHMARK(BM_CONCAT_OPENCL_##N##_##H##_##W##_##C##_##TYPE)
+  MACE_BENCHMARK(MACE_BM_CONCAT_OPENCL_##N##_##H##_##W##_##C##_##TYPE)

-BM_CONCAT_OPENCL_MACRO(3, 32, 32, 32, float);
-BM_CONCAT_OPENCL_MACRO(3, 32, 32, 64, float);
-BM_CONCAT_OPENCL_MACRO(3, 32, 32, 128, float);
-BM_CONCAT_OPENCL_MACRO(3, 32, 32, 256, float);
-BM_CONCAT_OPENCL_MACRO(3, 32, 32, 32, half);
-BM_CONCAT_OPENCL_MACRO(3, 32, 32, 64, half);
-BM_CONCAT_OPENCL_MACRO(3, 32, 32, 128, half);
-BM_CONCAT_OPENCL_MACRO(3, 32, 32, 256, half);
+MACE_BM_CONCAT_OPENCL_MACRO(3, 32, 32, 32, float);
+MACE_BM_CONCAT_OPENCL_MACRO(3, 32, 32, 64, float);
+MACE_BM_CONCAT_OPENCL_MACRO(3, 32, 32, 128, float);
+MACE_BM_CONCAT_OPENCL_MACRO(3, 32, 32, 256, float);
+MACE_BM_CONCAT_OPENCL_MACRO(3, 32, 32, 32, half);
+MACE_BM_CONCAT_OPENCL_MACRO(3, 32, 32, 64, half);
+MACE_BM_CONCAT_OPENCL_MACRO(3, 32, 32, 128, half);
+MACE_BM_CONCAT_OPENCL_MACRO(3, 32, 32, 256, half);

 }  // namespace test
 }  // namespace ops
mace/ops/conv_2d.cc

@@ -18,24 +18,24 @@ namespace mace {
 namespace ops {

 void Register_Conv2D(OperatorRegistry *op_registry) {
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Conv2D").Device(DeviceType::CPU).TypeConstraint<float>("T").Build(), Conv2dOp<DeviceType::CPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Conv2D").Device(DeviceType::CPU).TypeConstraint<float>("T").Build(), Conv2dOp<DeviceType::CPU, float>);

 #ifdef MACE_ENABLE_OPENCL
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Conv2D").Device(DeviceType::GPU).TypeConstraint<float>("T").Build(), Conv2dOp<DeviceType::GPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Conv2D").Device(DeviceType::GPU).TypeConstraint<float>("T").Build(), Conv2dOp<DeviceType::GPU, float>);
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Conv2D").Device(DeviceType::GPU).TypeConstraint<half>("T").Build(), Conv2dOp<DeviceType::GPU, half>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Conv2D").Device(DeviceType::GPU).TypeConstraint<half>("T").Build(), Conv2dOp<DeviceType::GPU, half>);
 #endif  // MACE_ENABLE_OPENCL
 }
mace/ops/conv_2d.h

@@ -54,8 +54,8 @@ class Conv2dOp : public ConvPool2dOpBase<D, T> {
   kernels::Conv2dFunctor<D, T> functor_;

 protected:
-  OP_INPUT_TAGS(INPUT, FILTER, BIAS);
-  OP_OUTPUT_TAGS(OUTPUT);
+  MACE_OP_INPUT_TAGS(INPUT, FILTER, BIAS);
+  MACE_OP_OUTPUT_TAGS(OUTPUT);
 };

 }  // namespace ops
mace/ops/conv_2d_benchmark.cc
...
@@ -105,11 +105,11 @@ void Conv2d(int iters,
 // approximate the amortized latency. The OpenCL runtime for Mali/Adreno is
 // in-order.
-#define BM_CONV_2D_MACRO(N, C, H, W, KH, KW, STRIDE, DILATION, P, OC, TYPE, \
-                         DEVICE) \
+#define MACE_BM_CONV_2D_MACRO( \
+    N, C, H, W, KH, KW, STRIDE, DILATION, P, OC, TYPE, DEVICE) \
   static void \
-      BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##D##DILATION \
-          ##_##P##_##OC##_##TYPE##_##DEVICE( \
+      MACE_BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##D## \
+          DILATION##_##P##_##OC##_##TYPE##_##DEVICE( \
       int iters) { \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
     int64_t pad_h = 0, pad_w = 0; \
...
@@ -128,54 +128,53 @@ void Conv2d(int iters,
     Conv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE, DILATION, \
                          mace::Padding::P, OC); \
   } \
-  BENCHMARK( \
-      BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##D##DILATION \
-          ##_##P##_##OC##_##TYPE##_##DEVICE)
+  MACE_BENCHMARK( \
+      MACE_BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##D## \
+          DILATION##_##P##_##OC##_##TYPE##_##DEVICE)
-#define BM_CONV_2D(N, C, H, W, KH, KW, S, D, P, OC) \
-  BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, float, CPU); \
-  BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, float, GPU); \
-  BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, half, GPU);
+#define MACE_BM_CONV_2D(N, C, H, W, KH, KW, S, D, P, OC) \
+  MACE_BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, float, CPU); \
+  MACE_BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, float, GPU); \
+  MACE_BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, half, GPU);
 // Filter sizes and data alignments
-BM_CONV_2D(1, 64, 32, 32, 1, 1, 1, 1, VALID, 128);
-BM_CONV_2D(1, 64, 33, 31, 1, 1, 1, 1, VALID, 128);
-BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, 1, SAME, 128);
-BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, 1, SAME, 128);
-BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, 1, SAME, 128);
-BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, 1, SAME, 128);
-BM_CONV_2D(1, 64, 32, 31, 15, 1, 1, 1, SAME, 128);
-BM_CONV_2D(1, 64, 32, 31, 1, 15, 1, 1, SAME, 128);
-BM_CONV_2D(1, 64, 32, 31, 7, 7, 1, 1, SAME, 128);
-BM_CONV_2D(1, 64, 32, 31, 7, 7, 2, 1, SAME, 128);
-BM_CONV_2D(1, 64, 32, 31, 7, 7, 3, 1, SAME, 128);
+MACE_BM_CONV_2D(1, 64, 32, 32, 1, 1, 1, 1, VALID, 128);
+MACE_BM_CONV_2D(1, 64, 33, 31, 1, 1, 1, 1, VALID, 128);
+MACE_BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, 1, SAME, 128);
+MACE_BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, 1, SAME, 128);
+MACE_BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, 1, SAME, 128);
+MACE_BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, 1, SAME, 128);
+MACE_BM_CONV_2D(1, 64, 32, 31, 15, 1, 1, 1, SAME, 128);
+MACE_BM_CONV_2D(1, 64, 32, 31, 1, 15, 1, 1, SAME, 128);
+MACE_BM_CONV_2D(1, 64, 32, 31, 7, 7, 1, 1, SAME, 128);
+MACE_BM_CONV_2D(1, 64, 32, 31, 7, 7, 2, 1, SAME, 128);
+MACE_BM_CONV_2D(1, 64, 32, 31, 7, 7, 3, 1, SAME, 128);
 // 3 channels input
-BM_CONV_2D(1, 3, 480, 480, 1, 1, 1, 1, VALID, 3);
-BM_CONV_2D(1, 3, 224, 224, 3, 3, 2, 1, SAME, 32);
-BM_CONV_2D(1, 3, 224, 224, 3, 3, 2, 1, VALID, 32);
+MACE_BM_CONV_2D(1, 3, 480, 480, 1, 1, 1, 1, VALID, 3);
+MACE_BM_CONV_2D(1, 3, 224, 224, 3, 3, 2, 1, SAME, 32);
+MACE_BM_CONV_2D(1, 3, 224, 224, 3, 3, 2, 1, VALID, 32);
 // Dilations
-BM_CONV_2D(1, 32, 256, 256, 3, 3, 1, 2, VALID, 32);
-BM_CONV_2D(1, 32, 256, 256, 3, 3, 1, 4, VALID, 32);
+MACE_BM_CONV_2D(1, 32, 256, 256, 3, 3, 1, 2, VALID, 32);
+MACE_BM_CONV_2D(1, 32, 256, 256, 3, 3, 1, 4, VALID, 32);
 // MobileNet
-BM_CONV_2D(1, 128, 56, 56, 1, 1, 1, 1, SAME, 128);
-BM_CONV_2D(1, 1024, 7, 7, 1, 1, 1, 1, SAME, 1024);
+MACE_BM_CONV_2D(1, 128, 56, 56, 1, 1, 1, 1, SAME, 128);
+MACE_BM_CONV_2D(1, 1024, 7, 7, 1, 1, 1, 1, SAME, 1024);
-BM_CONV_2D(64, 32, 34, 34, 3, 3, 1, 1, VALID, 32);
-BM_CONV_2D(1, 32, 34, 34, 3, 3, 1, 1, VALID, 32);
+MACE_BM_CONV_2D(64, 32, 34, 34, 3, 3, 1, 1, VALID, 32);
+MACE_BM_CONV_2D(1, 32, 34, 34, 3, 3, 1, 1, VALID, 32);
-BM_CONV_2D(1, 192, 17, 17, 1, 7, 1, 1, SAME, 192);
-BM_CONV_2D(1, 192, 17, 17, 7, 1, 1, 1, SAME, 192);
-BM_CONV_2D(1, 160, 17, 17, 7, 1, 1, 1, SAME, 192);
-BM_CONV_2D(1, 32, 256, 256, 1, 15, 1, 1, SAME, 2);
-BM_CONV_2D(1, 32, 256, 256, 15, 1, 1, 1, SAME, 2);
-BM_CONV_2D(1, 64, 64, 64, 15, 1, 1, 1, SAME, 2);
+MACE_BM_CONV_2D(1, 192, 17, 17, 1, 7, 1, 1, SAME, 192);
+MACE_BM_CONV_2D(1, 192, 17, 17, 7, 1, 1, 1, SAME, 192);
+MACE_BM_CONV_2D(1, 160, 17, 17, 7, 1, 1, 1, SAME, 192);
+MACE_BM_CONV_2D(1, 32, 256, 256, 1, 15, 1, 1, SAME, 2);
+MACE_BM_CONV_2D(1, 32, 256, 256, 15, 1, 1, 1, SAME, 2);
+MACE_BM_CONV_2D(1, 64, 64, 64, 15, 1, 1, 1, SAME, 2);
 }  // namespace test
 }  // namespace ops
 }  // namespace mace
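To make the rename concrete, expanding one wrapper invocation by hand against the two macros above (a sketch, not generated output) shows what each MACE_BM_CONV_2D line registers:

// MACE_BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, 1, SAME, 128) instantiates
// MACE_BM_CONV_2D_MACRO three times: (float, CPU), (float, GPU), (half, GPU).
// The (float, CPU) instantiation pastes together roughly this benchmark:
static void MACE_BM_CONV_2D_1_64_32_32_K3x3S1D1_SAME_128_float_CPU(int iters) {
  const int64_t tot = static_cast<int64_t>(iters) * 1 * 64 * 32 * 32;
  int64_t pad_h = 0, pad_w = 0;
  // ... MACC / bytes accounting elided; see the macro body above ...
  Conv2d<CPU, float>(iters, 1, 64, 32, 32, 3, 3, 1, 1, mace::Padding::SAME, 128);
}
MACE_BENCHMARK(MACE_BM_CONV_2D_1_64_32_32_K3x3S1D1_SAME_128_float_CPU)

Before this commit the same expansion produced a BM_CONV_2D_... function registered with BENCHMARK(...).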
mace/ops/deconv_2d.cc
...
@@ -18,24 +18,24 @@ namespace mace {
 namespace ops {
 void Register_Deconv2D(OperatorRegistry *op_registry) {
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Deconv2D").Device(DeviceType::CPU).TypeConstraint<float>("T").Build(), Deconv2dOp<DeviceType::CPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Deconv2D").Device(DeviceType::CPU).TypeConstraint<float>("T").Build(), Deconv2dOp<DeviceType::CPU, float>);
 #ifdef MACE_ENABLE_OPENCL
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Deconv2D").Device(DeviceType::GPU).TypeConstraint<float>("T").Build(), Deconv2dOp<DeviceType::GPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Deconv2D").Device(DeviceType::GPU).TypeConstraint<float>("T").Build(), Deconv2dOp<DeviceType::GPU, float>);
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Deconv2D").Device(DeviceType::GPU).TypeConstraint<half>("T").Build(), Deconv2dOp<DeviceType::GPU, half>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Deconv2D").Device(DeviceType::GPU).TypeConstraint<half>("T").Build(), Deconv2dOp<DeviceType::GPU, half>);
 #endif  // MACE_ENABLE_OPENCL
 }
...
mace/ops/deconv_2d.h
...
@@ -49,8 +49,8 @@ class Deconv2dOp : public ConvPool2dOpBase<D, T> {
   kernels::Deconv2dFunctor<D, T> functor_;
 protected:
-  OP_INPUT_TAGS(INPUT, FILTER, BIAS);
-  OP_OUTPUT_TAGS(OUTPUT);
+  MACE_OP_INPUT_TAGS(INPUT, FILTER, BIAS);
+  MACE_OP_OUTPUT_TAGS(OUTPUT);
 };
 }  // namespace ops
...
mace/ops/deconv_2d_benchmark.cc
...
@@ -94,11 +94,11 @@ static void Deconv2d(int iters,
 // approximate the amortized latency. The OpenCL runtime for Mali/Adreno is
 // in-order.
-#define BM_DECONV_2D_MACRO(N, C, H, W, KH, KW, STRIDE, OH, OW, P, OC, TYPE, \
-                           DEVICE) \
+#define MACE_BM_DECONV_2D_MACRO( \
+    N, C, H, W, KH, KW, STRIDE, OH, OW, P, OC, TYPE, DEVICE) \
   static void \
-      BM_DECONV_2D_##N##_##C##_##H##_##W##_##KH##_##KW##_##STRIDE##_##OH##_##OW \
-          ##_##P##_##OC##_##TYPE##_##DEVICE( \
+      MACE_BM_DECONV_2D_##N##_##C##_##H##_##W##_##KH##_##KW##_##STRIDE##_##OH##_ \
+          ##OW##_##P##_##OC##_##TYPE##_##DEVICE( \
       int iters) { \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
     int64_t oh = OH; \
...
@@ -110,30 +110,30 @@ static void Deconv2d(int iters,
     Deconv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE, OH, OW, \
                            mace::Padding::P, OC); \
   } \
-  BENCHMARK( \
-      BM_DECONV_2D_##N##_##C##_##H##_##W##_##KH##_##KW##_##STRIDE##_##OH##_##OW## \
-          _##P##_##OC##_##TYPE##_##DEVICE)
+  MACE_BENCHMARK( \
+      MACE_BM_DECONV_2D_##N##_##C##_##H##_##W##_##KH##_##KW##_##STRIDE##_##OH##_ \
+          ##OW##_##P##_##OC##_##TYPE##_##DEVICE)
 // TODO(liutuo): add cpu benchmark when optimized.
-#define BM_DECONV_2D(N, C, H, W, KH, KW, S, OH, OW, P, OC) \
-  BM_DECONV_2D_MACRO(N, C, H, W, KH, KW, S, OH, OW, P, OC, float, GPU); \
-  BM_DECONV_2D_MACRO(N, C, H, W, KH, KW, S, OH, OW, P, OC, half, GPU);
+#define MACE_BM_DECONV_2D(N, C, H, W, KH, KW, S, OH, OW, P, OC) \
+  MACE_BM_DECONV_2D_MACRO(N, C, H, W, KH, KW, S, OH, OW, P, OC, float, GPU); \
+  MACE_BM_DECONV_2D_MACRO(N, C, H, W, KH, KW, S, OH, OW, P, OC, half, GPU);
-BM_DECONV_2D(1, 128, 15, 15, 1, 1, 1, 15, 15, VALID, 256);
-BM_DECONV_2D(1, 32, 60, 60, 1, 1, 1, 60, 60, VALID, 128);
+MACE_BM_DECONV_2D(1, 128, 15, 15, 1, 1, 1, 15, 15, VALID, 256);
+MACE_BM_DECONV_2D(1, 32, 60, 60, 1, 1, 1, 60, 60, VALID, 128);
-BM_DECONV_2D(1, 128, 60, 60, 3, 3, 1, 62, 62, VALID, 128);
-BM_DECONV_2D(1, 32, 60, 60, 3, 3, 1, 60, 60, SAME, 32);
-BM_DECONV_2D(1, 3, 512, 512, 7, 7, 2, 1023, 1023, SAME, 32);
-BM_DECONV_2D(1, 128, 16, 16, 5, 5, 1, 20, 20, VALID, 32);
-BM_DECONV_2D(1, 128, 64, 64, 5, 5, 1, 68, 68, VALID, 32);
+MACE_BM_DECONV_2D(1, 128, 60, 60, 3, 3, 1, 62, 62, VALID, 128);
+MACE_BM_DECONV_2D(1, 32, 60, 60, 3, 3, 1, 60, 60, SAME, 32);
+MACE_BM_DECONV_2D(1, 3, 512, 512, 7, 7, 2, 1023, 1023, SAME, 32);
+MACE_BM_DECONV_2D(1, 128, 16, 16, 5, 5, 1, 20, 20, VALID, 32);
+MACE_BM_DECONV_2D(1, 128, 64, 64, 5, 5, 1, 68, 68, VALID, 32);
-BM_DECONV_2D(1, 3, 480, 480, 1, 1, 1, 480, 480, VALID, 3);
+MACE_BM_DECONV_2D(1, 3, 480, 480, 1, 1, 1, 480, 480, VALID, 3);
-BM_DECONV_2D(1, 64, 32, 32, 1, 1, 1, 32, 32, VALID, 128);
-BM_DECONV_2D(1, 64, 33, 32, 3, 3, 2, 65, 63, SAME, 128);
-BM_DECONV_2D(1, 3, 224, 224, 3, 3, 2, 447, 447, SAME, 32);
-BM_DECONV_2D(1, 3, 224, 224, 3, 3, 2, 449, 449, VALID, 32);
+MACE_BM_DECONV_2D(1, 64, 32, 32, 1, 1, 1, 32, 32, VALID, 128);
+MACE_BM_DECONV_2D(1, 64, 33, 32, 3, 3, 2, 65, 63, SAME, 128);
+MACE_BM_DECONV_2D(1, 3, 224, 224, 3, 3, 2, 447, 447, SAME, 32);
+MACE_BM_DECONV_2D(1, 3, 224, 224, 3, 3, 2, 449, 449, VALID, 32);
 }  // namespace test
 }  // namespace ops
...
mace/ops/depth_to_space.cc
...
@@ -18,24 +18,24 @@ namespace mace {
 namespace ops {
 void Register_DepthToSpace(OperatorRegistry *op_registry) {
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthToSpace").Device(DeviceType::CPU).TypeConstraint<float>("T").Build(), DepthToSpaceOp<DeviceType::CPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthToSpace").Device(DeviceType::CPU).TypeConstraint<float>("T").Build(), DepthToSpaceOp<DeviceType::CPU, float>);
 #ifdef MACE_ENABLE_OPENCL
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthToSpace").Device(DeviceType::GPU).TypeConstraint<float>("T").Build(), DepthToSpaceOp<DeviceType::GPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthToSpace").Device(DeviceType::GPU).TypeConstraint<float>("T").Build(), DepthToSpaceOp<DeviceType::GPU, float>);
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthToSpace").Device(DeviceType::GPU).TypeConstraint<half>("T").Build(), DepthToSpaceOp<DeviceType::GPU, half>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthToSpace").Device(DeviceType::GPU).TypeConstraint<half>("T").Build(), DepthToSpaceOp<DeviceType::GPU, half>);
 #endif  // MACE_ENABLE_OPENCL
 }
...
mace/ops/depth_to_space.h
...
@@ -55,8 +55,8 @@ class DepthToSpaceOp : public Operator<D, T> {
 protected:
   const int block_size_;
-  OP_INPUT_TAGS(INPUT);
-  OP_OUTPUT_TAGS(OUTPUT);
+  MACE_OP_INPUT_TAGS(INPUT);
+  MACE_OP_OUTPUT_TAGS(OUTPUT);
 private:
   kernels::DepthToSpaceOpFunctor<D, T> functor_;
...
mace/ops/depth_to_space_benchmark.cc
...
@@ -69,25 +69,26 @@ void DepthToSpace(
 }
 }  // namespace
-#define BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, TYPE, DEVICE) \
-  static void \
-      BM_DEPTH_TO_SPACE_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE( \
-      int iters) { \
-    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
-    mace::testing::MaccProcessed(tot); \
-    mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
-    DepthToSpace<DEVICE, TYPE>(iters, N, C, H, W, G); \
-  } \
-  BENCHMARK(BM_DEPTH_TO_SPACE_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE)
+#define MACE_BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, TYPE, DEVICE) \
+  static void \
+      MACE_BM_DEPTH_TO_SPACE_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE( \
+      int iters) { \
+    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
+    mace::testing::MaccProcessed(tot); \
+    mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
+    DepthToSpace<DEVICE, TYPE>(iters, N, C, H, W, G); \
+  } \
+  MACE_BENCHMARK( \
+      MACE_BM_DEPTH_TO_SPACE_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE)
-#define BM_DEPTH_TO_SPACE(N, C, H, W, G) \
-  BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, float, CPU); \
-  BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, float, GPU); \
-  BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, half, GPU);
+#define MACE_BM_DEPTH_TO_SPACE(N, C, H, W, G) \
+  MACE_BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, float, CPU); \
+  MACE_BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, float, GPU); \
+  MACE_BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, half, GPU);
-BM_DEPTH_TO_SPACE(1, 64, 64, 64, 4);
-BM_DEPTH_TO_SPACE(1, 64, 128, 128, 4);
-BM_DEPTH_TO_SPACE(1, 64, 256, 256, 4);
+MACE_BM_DEPTH_TO_SPACE(1, 64, 64, 64, 4);
+MACE_BM_DEPTH_TO_SPACE(1, 64, 128, 128, 4);
+MACE_BM_DEPTH_TO_SPACE(1, 64, 256, 256, 4);
 }  // namespace test
 }  // namespace ops
...
mace/ops/depthwise_conv2d.cc
...
@@ -18,24 +18,24 @@ namespace mace {
 namespace ops {
 void Register_DepthwiseConv2d(OperatorRegistry *op_registry) {
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthwiseConv2d").Device(DeviceType::CPU).TypeConstraint<float>("T").Build(), DepthwiseConv2dOp<DeviceType::CPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthwiseConv2d").Device(DeviceType::CPU).TypeConstraint<float>("T").Build(), DepthwiseConv2dOp<DeviceType::CPU, float>);
 #ifdef MACE_ENABLE_OPENCL
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthwiseConv2d").Device(DeviceType::GPU).TypeConstraint<float>("T").Build(), DepthwiseConv2dOp<DeviceType::GPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthwiseConv2d").Device(DeviceType::GPU).TypeConstraint<float>("T").Build(), DepthwiseConv2dOp<DeviceType::GPU, float>);
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthwiseConv2d").Device(DeviceType::GPU).TypeConstraint<half>("T").Build(), DepthwiseConv2dOp<DeviceType::GPU, half>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthwiseConv2d").Device(DeviceType::GPU).TypeConstraint<half>("T").Build(), DepthwiseConv2dOp<DeviceType::GPU, half>);
 #endif  // MACE_ENABLE_OPENCL
 }
...
mace/ops/depthwise_conv2d.h
...
@@ -55,8 +55,8 @@ class DepthwiseConv2dOp : public ConvPool2dOpBase<D, T> {
   kernels::DepthwiseConv2dFunctor<D, T> functor_;
 protected:
-  OP_INPUT_TAGS(INPUT, FILTER, BIAS);
-  OP_OUTPUT_TAGS(OUTPUT);
+  MACE_OP_INPUT_TAGS(INPUT, FILTER, BIAS);
+  MACE_OP_OUTPUT_TAGS(OUTPUT);
 };
 }  // namespace ops
...
mace/ops/depthwise_conv2d_benchmark.cc
...
@@ -101,61 +101,61 @@ void DepthwiseConv2d(int iters,
 }
 }  // namespace
-#define BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, STRIDE, P, M, TYPE, \
-                                   DEVICE) \
-  static void \
-      BM_DEPTHWISE_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_ \
-          ##P##_##M##_##TYPE##_##DEVICE( \
-      int iters) { \
-    const int64_t dilation = 1; \
-    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
-    int64_t pad_h = 0, pad_w = 0; \
-    if (P == SAME) { \
-      pad_h = KH / 2; \
-      pad_w = KW / 2; \
-    } \
-    int64_t oh = \
-        (H + 2 * pad_h - KH - (KH - 1) * (dilation - 1)) / STRIDE + 1; \
-    int64_t ow = \
-        (W + 2 * pad_w - KW - (KW - 1) * (dilation - 1)) / STRIDE + 1; \
-    const int64_t macc = \
-        static_cast<int64_t>(iters) * N * C * M * oh * ow * (KH * KW + 1); \
-    mace::testing::MaccProcessed(macc); \
-    mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
-    DepthwiseConv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE, \
-                                  mace::Padding::P, M); \
-  } \
-  BENCHMARK( \
-      BM_DEPTHWISE_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_ \
-          ##P##_##M##_##TYPE##_##DEVICE)
+#define MACE_BM_DEPTHWISE_CONV_2D_MACRO( \
+    N, C, H, W, KH, KW, STRIDE, P, M, TYPE, DEVICE) \
+  static void \
+      MACE_BM_DEPTHWISE_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE \
+          ##_##P##_##M##_##TYPE##_##DEVICE( \
+      int iters) { \
+    const int64_t dilation = 1; \
+    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
+    int64_t pad_h = 0, pad_w = 0; \
+    if (P == SAME) { \
+      pad_h = KH / 2; \
+      pad_w = KW / 2; \
+    } \
+    int64_t oh = \
+        (H + 2 * pad_h - KH - (KH - 1) * (dilation - 1)) / STRIDE + 1; \
+    int64_t ow = \
+        (W + 2 * pad_w - KW - (KW - 1) * (dilation - 1)) / STRIDE + 1; \
+    const int64_t macc = \
+        static_cast<int64_t>(iters) * N * C * M * oh * ow * (KH * KW + 1); \
+    mace::testing::MaccProcessed(macc); \
+    mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
+    DepthwiseConv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE, \
+                                  mace::Padding::P, M); \
+  } \
+  MACE_BENCHMARK( \
+      MACE_BM_DEPTHWISE_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE \
+          ##_##P##_##M##_##TYPE##_##DEVICE)
-#define BM_DEPTHWISE_CONV_2D(N, C, H, W, KH, KW, S, P, M) \
-  BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, CPU); \
-  BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, GPU); \
-  BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, half, GPU);
+#define MACE_BM_DEPTHWISE_CONV_2D(N, C, H, W, KH, KW, S, P, M) \
+  MACE_BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, CPU); \
+  MACE_BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, GPU); \
+  MACE_BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, half, GPU);
-BM_DEPTHWISE_CONV_2D(1, 32, 112, 112, 3, 3, 1, SAME, 1);
-BM_DEPTHWISE_CONV_2D(1, 32, 56, 56, 3, 3, 2, VALID, 1);
-BM_DEPTHWISE_CONV_2D(1, 32, 112, 112, 3, 3, 2, VALID, 1);
-BM_DEPTHWISE_CONV_2D(1, 32, 224, 224, 3, 3, 2, VALID, 1);
-BM_DEPTHWISE_CONV_2D(1, 64, 56, 56, 3, 3, 2, VALID, 1);
-BM_DEPTHWISE_CONV_2D(1, 64, 112, 112, 3, 3, 2, VALID, 1);
-BM_DEPTHWISE_CONV_2D(1, 64, 224, 224, 3, 3, 2, VALID, 1);
-BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 1, VALID, 1);
-BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 1);
-BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 1, SAME, 1);
-BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 1, SAME, 1);
-BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 1, VALID, 1);
-BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 1, SAME, 1);
-BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 2, VALID, 1);
-BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 2, VALID, 1);
-BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 2, SAME, 1);
-BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 2, SAME, 1);
-BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 2, VALID, 1);
-BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 2, SAME, 1);
-BM_DEPTHWISE_CONV_2D(1, 3, 112, 112, 3, 3, 2, VALID, 1);
-BM_DEPTHWISE_CONV_2D(1, 3, 224, 224, 3, 3, 2, SAME, 1);
-BM_DEPTHWISE_CONV_2D(1, 8, 224, 224, 3, 3, 2, SAME, 1);
+MACE_BM_DEPTHWISE_CONV_2D(1, 32, 112, 112, 3, 3, 1, SAME, 1);
+MACE_BM_DEPTHWISE_CONV_2D(1, 32, 56, 56, 3, 3, 2, VALID, 1);
+MACE_BM_DEPTHWISE_CONV_2D(1, 32, 112, 112, 3, 3, 2, VALID, 1);
+MACE_BM_DEPTHWISE_CONV_2D(1, 32, 224, 224, 3, 3, 2, VALID, 1);
+MACE_BM_DEPTHWISE_CONV_2D(1, 64, 56, 56, 3, 3, 2, VALID, 1);
+MACE_BM_DEPTHWISE_CONV_2D(1, 64, 112, 112, 3, 3, 2, VALID, 1);
+MACE_BM_DEPTHWISE_CONV_2D(1, 64, 224, 224, 3, 3, 2, VALID, 1);
+MACE_BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 1, VALID, 1);
+MACE_BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 1);
+MACE_BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 1, SAME, 1);
+MACE_BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 1, SAME, 1);
+MACE_BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 1, VALID, 1);
+MACE_BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 1, SAME, 1);
+MACE_BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 2, VALID, 1);
+MACE_BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 2, VALID, 1);
+MACE_BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 2, SAME, 1);
+MACE_BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 2, SAME, 1);
+MACE_BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 2, VALID, 1);
+MACE_BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 2, SAME, 1);
+MACE_BM_DEPTHWISE_CONV_2D(1, 3, 112, 112, 3, 3, 2, VALID, 1);
+MACE_BM_DEPTHWISE_CONV_2D(1, 3, 224, 224, 3, 3, 2, SAME, 1);
+MACE_BM_DEPTHWISE_CONV_2D(1, 8, 224, 224, 3, 3, 2, SAME, 1);
 }  // namespace test
...
mace/ops/eltwise.cc
...
@@ -18,24 +18,24 @@ namespace mace {
 namespace ops {
 void Register_Eltwise(OperatorRegistry *op_registry) {
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise").Device(DeviceType::CPU).TypeConstraint<float>("T").Build(), EltwiseOp<DeviceType::CPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise").Device(DeviceType::CPU).TypeConstraint<float>("T").Build(), EltwiseOp<DeviceType::CPU, float>);
 #ifdef MACE_ENABLE_OPENCL
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise").Device(DeviceType::GPU).TypeConstraint<float>("T").Build(), EltwiseOp<DeviceType::GPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise").Device(DeviceType::GPU).TypeConstraint<float>("T").Build(), EltwiseOp<DeviceType::GPU, float>);
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise").Device(DeviceType::GPU).TypeConstraint<half>("T").Build(), EltwiseOp<DeviceType::GPU, half>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise").Device(DeviceType::GPU).TypeConstraint<half>("T").Build(), EltwiseOp<DeviceType::GPU, half>);
 #endif  // MACE_ENABLE_OPENCL
 }
...
mace/ops/eltwise.h
...
@@ -43,7 +43,7 @@ class EltwiseOp : public Operator<D, T> {
   kernels::EltwiseFunctor<D, T> functor_;
 private:
-  OP_OUTPUT_TAGS(OUTPUT);
+  MACE_OP_OUTPUT_TAGS(OUTPUT);
 };
 }  // namespace ops
...
mace/ops/eltwise_benchmark.cc
...
@@ -76,30 +76,31 @@ void EltwiseBenchmark(
 }
 }  // namespace
-#define BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, TYPE, DEVICE) \
-  static void \
-      BM_ELTWISE_##ELT_TYPE##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \
-      int iters) { \
-    const int64_t tot = static_cast<int64_t>(iters) * N * H * W * C; \
-    mace::testing::MaccProcessed(tot); \
-    mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
-    EltwiseBenchmark<DEVICE, TYPE>( \
-        iters, static_cast<kernels::EltwiseType>(ELT_TYPE), N, H, W, C); \
-  } \
-  BENCHMARK(BM_ELTWISE_##ELT_TYPE##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)
+#define MACE_BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, TYPE, DEVICE) \
+  static void \
+      MACE_BM_ELTWISE_##ELT_TYPE##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \
+      int iters) { \
+    const int64_t tot = static_cast<int64_t>(iters) * N * H * W * C; \
+    mace::testing::MaccProcessed(tot); \
+    mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
+    EltwiseBenchmark<DEVICE, TYPE>( \
+        iters, static_cast<kernels::EltwiseType>(ELT_TYPE), N, H, W, C); \
+  } \
+  MACE_BENCHMARK( \
+      MACE_BM_ELTWISE_##ELT_TYPE##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)
-#define BM_ELTWISE(ELT_TYPE, N, H, W, C) \
-  BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, float, CPU); \
-  BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, float, GPU); \
-  BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, half, GPU);
+#define MACE_BM_ELTWISE(ELT_TYPE, N, H, W, C) \
+  MACE_BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, float, CPU); \
+  MACE_BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, float, GPU); \
+  MACE_BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, half, GPU);
-BM_ELTWISE(2, 1, 128, 128, 32);
-BM_ELTWISE(2, 1, 240, 240, 256);
-BM_ELTWISE(2, 1, 256, 256, 32);
-BM_ELTWISE(0, 1, 128, 128, 32);
-BM_ELTWISE(0, 1, 240, 240, 256);
-BM_ELTWISE(5, 1, 128, 128, 32);
-BM_ELTWISE(5, 1, 240, 240, 256);
+MACE_BM_ELTWISE(2, 1, 128, 128, 32);
+MACE_BM_ELTWISE(2, 1, 240, 240, 256);
+MACE_BM_ELTWISE(2, 1, 256, 256, 32);
+MACE_BM_ELTWISE(0, 1, 128, 128, 32);
+MACE_BM_ELTWISE(0, 1, 240, 240, 256);
+MACE_BM_ELTWISE(5, 1, 128, 128, 32);
+MACE_BM_ELTWISE(5, 1, 240, 240, 256);
 }  // namespace test
 }  // namespace ops
...
mace/ops/folded_batch_norm.cc
...
@@ -18,24 +18,24 @@ namespace mace {
 namespace ops {
 void Register_FoldedBatchNorm(OperatorRegistry *op_registry) {
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm").Device(DeviceType::CPU).TypeConstraint<float>("T").Build(), FoldedBatchNormOp<DeviceType::CPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm").Device(DeviceType::CPU).TypeConstraint<float>("T").Build(), FoldedBatchNormOp<DeviceType::CPU, float>);
 #ifdef MACE_ENABLE_OPENCL
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm").Device(DeviceType::GPU).TypeConstraint<float>("T").Build(), FoldedBatchNormOp<DeviceType::GPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm").Device(DeviceType::GPU).TypeConstraint<float>("T").Build(), FoldedBatchNormOp<DeviceType::GPU, float>);
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm").Device(DeviceType::GPU).TypeConstraint<half>("T").Build(), FoldedBatchNormOp<DeviceType::GPU, half>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm").Device(DeviceType::GPU).TypeConstraint<half>("T").Build(), FoldedBatchNormOp<DeviceType::GPU, half>);
 #endif  // MACE_ENABLE_OPENCL
 }
...
mace/ops/folded_batch_norm.h
...
@@ -56,8 +56,8 @@ class FoldedBatchNormOp : public Operator<D, T> {
   kernels::BatchNormFunctor<D, T> functor_;
 protected:
-  OP_INPUT_TAGS(INPUT, SCALE, OFFSET, MEAN, VAR);
-  OP_OUTPUT_TAGS(OUTPUT);
+  MACE_OP_INPUT_TAGS(INPUT, SCALE, OFFSET, MEAN, VAR);
+  MACE_OP_OUTPUT_TAGS(OUTPUT);
 };
 }  // namespace ops
...
mace/ops/fully_connected.cc
...
@@ -18,24 +18,24 @@ namespace mace {
 namespace ops {
 void Register_FullyConnected(OperatorRegistry *op_registry) {
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("FullyConnected").Device(DeviceType::CPU).TypeConstraint<float>("T").Build(), FullyConnectedOp<DeviceType::CPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("FullyConnected").Device(DeviceType::CPU).TypeConstraint<float>("T").Build(), FullyConnectedOp<DeviceType::CPU, float>);
 #ifdef MACE_ENABLE_OPENCL
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("FullyConnected").Device(DeviceType::GPU).TypeConstraint<float>("T").Build(), FullyConnectedOp<DeviceType::GPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("FullyConnected").Device(DeviceType::GPU).TypeConstraint<float>("T").Build(), FullyConnectedOp<DeviceType::GPU, float>);
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("FullyConnected").Device(DeviceType::GPU).TypeConstraint<half>("T").Build(), FullyConnectedOp<DeviceType::GPU, half>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("FullyConnected").Device(DeviceType::GPU).TypeConstraint<half>("T").Build(), FullyConnectedOp<DeviceType::GPU, half>);
 #endif  // MACE_ENABLE_OPENCL
 }
...
mace/ops/fully_connected.h
...
@@ -72,8 +72,8 @@ class FullyConnectedOp : public Operator<D, T> {
   kernels::FullyConnectedFunctor<D, T> functor_;
 protected:
-  OP_INPUT_TAGS(INPUT, WEIGHT, BIAS);
-  OP_OUTPUT_TAGS(OUTPUT);
+  MACE_OP_INPUT_TAGS(INPUT, WEIGHT, BIAS);
+  MACE_OP_OUTPUT_TAGS(OUTPUT);
 };
 }  // namespace ops
...
mace/ops/fully_connected_benchmark.cc
...
@@ -82,28 +82,28 @@ void FCBenchmark(
 }
 }  // namespace
-#define BM_FC_MACRO(N, H, W, C, OC, TYPE, DEVICE) \
-  static void BM_FC_##N##_##H##_##W##_##C##_##OC##_##TYPE##_##DEVICE( \
-      int iters) { \
-    const int64_t macc = \
-        static_cast<int64_t>(iters) * N * C * H * W * OC + OC; \
-    const int64_t tot = \
-        static_cast<int64_t>(iters) * (N + OC) * C * H * W + OC; \
-    mace::testing::MaccProcessed(macc); \
-    mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
-    FCBenchmark<DEVICE, TYPE>(iters, N, H, W, C, OC); \
-  } \
-  BENCHMARK(BM_FC_##N##_##H##_##W##_##C##_##OC##_##TYPE##_##DEVICE)
+#define MACE_BM_FC_MACRO(N, H, W, C, OC, TYPE, DEVICE) \
+  static void MACE_BM_FC_##N##_##H##_##W##_##C##_##OC##_##TYPE##_##DEVICE( \
+      int iters) { \
+    const int64_t macc = \
+        static_cast<int64_t>(iters) * N * C * H * W * OC + OC; \
+    const int64_t tot = \
+        static_cast<int64_t>(iters) * (N + OC) * C * H * W + OC; \
+    mace::testing::MaccProcessed(macc); \
+    mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
+    FCBenchmark<DEVICE, TYPE>(iters, N, H, W, C, OC); \
+  } \
+  MACE_BENCHMARK(MACE_BM_FC_##N##_##H##_##W##_##C##_##OC##_##TYPE##_##DEVICE)
-#define BM_FC(N, H, W, C, OC) \
-  BM_FC_MACRO(N, H, W, C, OC, float, CPU); \
-  BM_FC_MACRO(N, H, W, C, OC, float, GPU); \
-  BM_FC_MACRO(N, H, W, C, OC, half, GPU);
+#define MACE_BM_FC(N, H, W, C, OC) \
+  MACE_BM_FC_MACRO(N, H, W, C, OC, float, CPU); \
+  MACE_BM_FC_MACRO(N, H, W, C, OC, float, GPU); \
+  MACE_BM_FC_MACRO(N, H, W, C, OC, half, GPU);
-BM_FC(1, 16, 16, 32, 32);
-BM_FC(1, 8, 8, 32, 1000);
-BM_FC(1, 2, 2, 512, 2);
-BM_FC(1, 7, 7, 512, 2048);
+MACE_BM_FC(1, 16, 16, 32, 32);
+MACE_BM_FC(1, 8, 8, 32, 1000);
+MACE_BM_FC(1, 2, 2, 512, 2);
+MACE_BM_FC(1, 7, 7, 512, 2048);
 }  // namespace test
 }  // namespace ops
...
mace/ops/image_to_buffer.cc
...
@@ -18,17 +18,17 @@ namespace mace {
 namespace ops {
 void Register_ImageToBuffer(OperatorRegistry *op_registry) {
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("ImageToBuffer").Device(DeviceType::GPU).TypeConstraint<float>("T").Build(), ImageToBufferOp<DeviceType::GPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ImageToBuffer").Device(DeviceType::GPU).TypeConstraint<float>("T").Build(), ImageToBufferOp<DeviceType::GPU, float>);
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("ImageToBuffer").Device(DeviceType::GPU).TypeConstraint<half>("T").Build(), ImageToBufferOp<DeviceType::GPU, half>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ImageToBuffer").Device(DeviceType::GPU).TypeConstraint<half>("T").Build(), ImageToBufferOp<DeviceType::GPU, half>);
 }
 }  // namespace ops
...
mace/ops/image_to_buffer.h
...
@@ -41,8 +41,8 @@ class ImageToBufferOp : public Operator<D, T> {
   kernels::ImageToBufferFunctor<D, T> functor_;
 protected:
-  OP_INPUT_TAGS(INPUT);
-  OP_OUTPUT_TAGS(OUTPUT);
+  MACE_OP_INPUT_TAGS(INPUT);
+  MACE_OP_OUTPUT_TAGS(OUTPUT);
 };
 }  // namespace ops
...
mace/ops/local_response_norm.cc
...
@@ -18,11 +18,11 @@ namespace mace {
 namespace ops {
 void Register_LocalResponseNorm(OperatorRegistry *op_registry) {
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("LocalResponseNorm").Device(DeviceType::CPU).TypeConstraint<float>("T").Build(), LocalResponseNormOp<DeviceType::CPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("LocalResponseNorm").Device(DeviceType::CPU).TypeConstraint<float>("T").Build(), LocalResponseNormOp<DeviceType::CPU, float>);
 }
 }  // namespace ops
...
mace/ops/local_response_norm.h
...
@@ -53,8 +53,8 @@ class LocalResponseNormOp : public Operator<D, T> {
   kernels::LocalResponseNormFunctor<D, T> functor_;
 protected:
-  OP_INPUT_TAGS(INPUT);
-  OP_OUTPUT_TAGS(OUTPUT);
+  MACE_OP_INPUT_TAGS(INPUT);
+  MACE_OP_OUTPUT_TAGS(OUTPUT);
 };
 }  // namespace ops
...
mace/ops/local_response_norm_benchmark.cc
...
@@ -55,32 +55,34 @@ static void LocalResponseNorm(
   net.Sync();
 }
-#define BM_LOCAL_RESPONSE_NORM_MACRO(N, C, H, W, TYPE, DEVICE) \
-  static void BM_LOCAL_RESPONSE_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(\
-      int iters) { \
+#define MACE_BM_LOCAL_RESPONSE_NORM_MACRO(N, C, H, W, TYPE, DEVICE) \
+  static void \
+      MACE_BM_LOCAL_RESPONSE_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
+      int iters) { \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
     mace::testing::MaccProcessed(tot); \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
     LocalResponseNorm<DEVICE, TYPE>(iters, N, C, H, W); \
   } \
-  BENCHMARK(BM_LOCAL_RESPONSE_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
+  MACE_BENCHMARK( \
+      MACE_BM_LOCAL_RESPONSE_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
-#define BM_LOCAL_RESPONSE_NORM(N, C, H, W) \
-  BM_LOCAL_RESPONSE_NORM_MACRO(N, C, H, W, float, CPU);
+#define MACE_BM_LOCAL_RESPONSE_NORM(N, C, H, W) \
+  MACE_BM_LOCAL_RESPONSE_NORM_MACRO(N, C, H, W, float, CPU);
-BM_LOCAL_RESPONSE_NORM(1, 1, 512, 512);
-BM_LOCAL_RESPONSE_NORM(1, 3, 128, 128);
-BM_LOCAL_RESPONSE_NORM(1, 3, 512, 512);
-BM_LOCAL_RESPONSE_NORM(1, 32, 112, 112);
-BM_LOCAL_RESPONSE_NORM(1, 64, 256, 256);
-BM_LOCAL_RESPONSE_NORM(1, 64, 512, 512);
-BM_LOCAL_RESPONSE_NORM(1, 128, 56, 56);
-BM_LOCAL_RESPONSE_NORM(1, 128, 256, 256);
-BM_LOCAL_RESPONSE_NORM(1, 256, 14, 14);
-BM_LOCAL_RESPONSE_NORM(1, 512, 14, 14);
-BM_LOCAL_RESPONSE_NORM(1, 1024, 7, 7);
-BM_LOCAL_RESPONSE_NORM(32, 1, 256, 256);
-BM_LOCAL_RESPONSE_NORM(32, 3, 256, 256);
+MACE_BM_LOCAL_RESPONSE_NORM(1, 1, 512, 512);
+MACE_BM_LOCAL_RESPONSE_NORM(1, 3, 128, 128);
+MACE_BM_LOCAL_RESPONSE_NORM(1, 3, 512, 512);
+MACE_BM_LOCAL_RESPONSE_NORM(1, 32, 112, 112);
+MACE_BM_LOCAL_RESPONSE_NORM(1, 64, 256, 256);
+MACE_BM_LOCAL_RESPONSE_NORM(1, 64, 512, 512);
+MACE_BM_LOCAL_RESPONSE_NORM(1, 128, 56, 56);
+MACE_BM_LOCAL_RESPONSE_NORM(1, 128, 256, 256);
+MACE_BM_LOCAL_RESPONSE_NORM(1, 256, 14, 14);
+MACE_BM_LOCAL_RESPONSE_NORM(1, 512, 14, 14);
+MACE_BM_LOCAL_RESPONSE_NORM(1, 1024, 7, 7);
+MACE_BM_LOCAL_RESPONSE_NORM(32, 1, 256, 256);
+MACE_BM_LOCAL_RESPONSE_NORM(32, 3, 256, 256);
 }  // namespace test
 }  // namespace ops
...
mace/ops/matmul.cc
...
@@ -18,24 +18,24 @@ namespace mace {
 namespace ops {
 void Register_MatMul(OperatorRegistry *op_registry) {
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("MatMul").Device(DeviceType::CPU).TypeConstraint<float>("T").Build(), MatMulOp<DeviceType::CPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("MatMul").Device(DeviceType::CPU).TypeConstraint<float>("T").Build(), MatMulOp<DeviceType::CPU, float>);
 #ifdef MACE_ENABLE_OPENCL
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("MatMul").Device(DeviceType::GPU).TypeConstraint<float>("T").Build(), MatMulOp<DeviceType::GPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("MatMul").Device(DeviceType::GPU).TypeConstraint<float>("T").Build(), MatMulOp<DeviceType::GPU, float>);
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("MatMul").Device(DeviceType::GPU).TypeConstraint<half>("T").Build(), MatMulOp<DeviceType::GPU, half>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("MatMul").Device(DeviceType::GPU).TypeConstraint<half>("T").Build(), MatMulOp<DeviceType::GPU, half>);
 #endif  // MACE_ENABLE_OPENCL
 }
...
mace/ops/matmul_benchmark.cc
...
@@ -67,27 +67,28 @@ void MatMulBenchmark(
 }
 }  // namespace
-#define BM_MATMUL_MACRO(N, H, C, W, TYPE, DEVICE) \
-  static void BM_MATMUL_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE(int iters) { \
+#define MACE_BM_MATMUL_MACRO(N, H, C, W, TYPE, DEVICE) \
+  static void MACE_BM_MATMUL_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE( \
+      int iters) { \
     const int64_t macc = static_cast<int64_t>(iters) * N * C * H * W; \
     const int64_t tot = static_cast<int64_t>(iters) * N * (C * H + H * W); \
     mace::testing::MaccProcessed(macc); \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
     MatMulBenchmark<DEVICE, TYPE>(iters, N, H, C, W); \
   } \
-  BENCHMARK(BM_MATMUL_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE)
+  MACE_BENCHMARK(MACE_BM_MATMUL_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE)
-#define BM_MATMUL(N, H, C, W) \
-  BM_MATMUL_MACRO(N, H, C, W, float, CPU); \
-  BM_MATMUL_MACRO(N, H, C, W, float, GPU); \
-  BM_MATMUL_MACRO(N, H, C, W, half, GPU);
+#define MACE_BM_MATMUL(N, H, C, W) \
+  MACE_BM_MATMUL_MACRO(N, H, C, W, float, CPU); \
+  MACE_BM_MATMUL_MACRO(N, H, C, W, float, GPU); \
+  MACE_BM_MATMUL_MACRO(N, H, C, W, half, GPU);
-BM_MATMUL(16, 32, 128, 49);
-BM_MATMUL(16, 32, 128, 961);
-BM_MATMUL(16, 32, 128, 3969);
-BM_MATMUL(16, 128, 128, 49);
-BM_MATMUL(16, 128, 128, 961);
-BM_MATMUL(16, 128, 128, 3969);
+MACE_BM_MATMUL(16, 32, 128, 49);
+MACE_BM_MATMUL(16, 32, 128, 961);
+MACE_BM_MATMUL(16, 32, 128, 3969);
+MACE_BM_MATMUL(16, 128, 128, 49);
+MACE_BM_MATMUL(16, 128, 128, 961);
+MACE_BM_MATMUL(16, 128, 128, 3969);
 }  // namespace test
 }  // namespace ops
...
mace/ops/pad.cc
...
@@ -18,23 +18,23 @@ namespace mace {
 namespace ops {
 void Register_Pad(OperatorRegistry *op_registry) {
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pad").Device(DeviceType::CPU).TypeConstraint<float>("T").Build(), PadOp<DeviceType::CPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pad").Device(DeviceType::CPU).TypeConstraint<float>("T").Build(), PadOp<DeviceType::CPU, float>);
 #ifdef MACE_ENABLE_OPENCL
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pad").Device(DeviceType::GPU).TypeConstraint<float>("T").Build(), PadOp<DeviceType::GPU, float>);
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pad").Device(DeviceType::GPU).TypeConstraint<half>("T").Build(), PadOp<DeviceType::GPU, half>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pad").Device(DeviceType::GPU).TypeConstraint<float>("T").Build(), PadOp<DeviceType::GPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pad").Device(DeviceType::GPU).TypeConstraint<half>("T").Build(), PadOp<DeviceType::GPU, half>);
 #endif  // MACE_ENABLE_OPENCL
 }
...
mace/ops/pad_benchmark.cc
...
@@ -65,25 +65,25 @@ void Pad(int iters, int batch, int height,
 }
 }  // namespace
-#define BM_PAD_MACRO(N, H, W, C, PAD, TYPE, DEVICE) \
-  static void BM_PAD_##N##_##H##_##W##_##C##_##PAD##_##TYPE##_##DEVICE( \
-      int iters) { \
-    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
-    mace::testing::MaccProcessed(tot); \
-    mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
-    Pad<DEVICE, TYPE>(iters, N, H, W, C, PAD); \
-  } \
-  BENCHMARK(BM_PAD_##N##_##H##_##W##_##C##_##PAD##_##TYPE##_##DEVICE)
+#define MACE_BM_PAD_MACRO(N, H, W, C, PAD, TYPE, DEVICE) \
+  static void MACE_BM_PAD_##N##_##H##_##W##_##C##_##PAD##_##TYPE##_##DEVICE( \
+      int iters) { \
+    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
+    mace::testing::MaccProcessed(tot); \
+    mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
+    Pad<DEVICE, TYPE>(iters, N, H, W, C, PAD); \
+  } \
+  MACE_BENCHMARK(MACE_BM_PAD_##N##_##H##_##W##_##C##_##PAD##_##TYPE##_##DEVICE)
-#define BM_PAD(N, H, W, C, PAD) \
-  BM_PAD_MACRO(N, H, W, C, PAD, float, CPU); \
-  BM_PAD_MACRO(N, H, W, C, PAD, float, GPU); \
-  BM_PAD_MACRO(N, H, W, C, PAD, half, GPU);
+#define MACE_BM_PAD(N, H, W, C, PAD) \
+  MACE_BM_PAD_MACRO(N, H, W, C, PAD, float, CPU); \
+  MACE_BM_PAD_MACRO(N, H, W, C, PAD, float, GPU); \
+  MACE_BM_PAD_MACRO(N, H, W, C, PAD, half, GPU);
-BM_PAD(1, 512, 512, 1, 2);
-BM_PAD(1, 112, 112, 64, 1);
-BM_PAD(1, 256, 256, 32, 2);
-BM_PAD(1, 512, 512, 16, 2);
+MACE_BM_PAD(1, 512, 512, 1, 2);
+MACE_BM_PAD(1, 112, 112, 64, 1);
+MACE_BM_PAD(1, 256, 256, 32, 2);
+MACE_BM_PAD(1, 512, 512, 16, 2);
 }  // namespace test
 }  // namespace ops
...
mace/ops/pooling.cc
...
@@ -18,24 +18,24 @@ namespace mace {
 namespace ops {
 void Register_Pooling(OperatorRegistry *op_registry) {
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pooling").Device(DeviceType::CPU).TypeConstraint<float>("T").Build(), PoolingOp<DeviceType::CPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pooling").Device(DeviceType::CPU).TypeConstraint<float>("T").Build(), PoolingOp<DeviceType::CPU, float>);
 #ifdef MACE_ENABLE_OPENCL
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pooling").Device(DeviceType::GPU).TypeConstraint<float>("T").Build(), PoolingOp<DeviceType::GPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pooling").Device(DeviceType::GPU).TypeConstraint<float>("T").Build(), PoolingOp<DeviceType::GPU, float>);
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pooling").Device(DeviceType::GPU).TypeConstraint<half>("T").Build(), PoolingOp<DeviceType::GPU, half>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pooling").Device(DeviceType::GPU).TypeConstraint<half>("T").Build(), PoolingOp<DeviceType::GPU, half>);
 #endif  // MACE_ENABLE_OPENCL
 }
...
mace/ops/pooling.h
...
@@ -52,8 +52,8 @@ class PoolingOp : public ConvPool2dOpBase<D, T> {
   PoolingType pooling_type_;
   kernels::PoolingFunctor<D, T> functor_;
-  OP_INPUT_TAGS(INPUT);
-  OP_OUTPUT_TAGS(OUTPUT);
+  MACE_OP_INPUT_TAGS(INPUT);
+  MACE_OP_OUTPUT_TAGS(OUTPUT);
 };
 }  // namespace ops
...
mace/ops/pooling_benchmark.cc
...
@@ -87,29 +87,29 @@ void Pooling(int iters,
 }
 }  // namespace
-#define BM_POOLING_MACRO(N, C, H, W, KE, STRIDE, PA, PO, DEVICE) \
-  static void \
-      BM_POOLING_##N##_##C##_##H##_##W##_K##KE##S##STRIDE##_##PA##_##PO##_\
-          ##DEVICE( \
-      int iters) { \
-    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
-    mace::testing::MaccProcessed(tot); \
-    mace::testing::BytesProcessed(tot *(sizeof(float))); \
-    Pooling<DEVICE>(iters, N, C, H, W, KE, STRIDE, Padding::PA, \
-                    PoolingType::PO); \
-  } \
-  BENCHMARK( \
-      BM_POOLING_##N##_##C##_##H##_##W##_K##KE##S##STRIDE##_##PA##_##PO##_\
+#define MACE_BM_POOLING_MACRO(N, C, H, W, KE, STRIDE, PA, PO, DEVICE) \
+  static void \
+      MACE_BM_POOLING_##N##_##C##_##H##_##W##_K##KE##S##STRIDE##_##PA##_##PO##_\
+          ##DEVICE( \
+      int iters) { \
+    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
+    mace::testing::MaccProcessed(tot); \
+    mace::testing::BytesProcessed(tot *(sizeof(float))); \
+    Pooling<DEVICE>(iters, N, C, H, W, KE, STRIDE, Padding::PA, \
+                    PoolingType::PO); \
+  } \
+  MACE_BENCHMARK( \
+      MACE_BM_POOLING_##N##_##C##_##H##_##W##_K##KE##S##STRIDE##_##PA##_##PO##_\
       ##DEVICE)
-#define BM_POOLING(N, C, H, W, K, S, PA, PO) \
-  BM_POOLING_MACRO(N, C, H, W, K, S, PA, PO, CPU); \
-  BM_POOLING_MACRO(N, C, H, W, K, S, PA, PO, GPU);
+#define MACE_BM_POOLING(N, C, H, W, K, S, PA, PO) \
+  MACE_BM_POOLING_MACRO(N, C, H, W, K, S, PA, PO, CPU); \
+  MACE_BM_POOLING_MACRO(N, C, H, W, K, S, PA, PO, GPU);
-BM_POOLING(1, 3, 129, 129, 2, 2, SAME, MAX);
-BM_POOLING(1, 3, 257, 257, 2, 2, SAME, MAX);
-BM_POOLING(1, 3, 513, 513, 2, 2, SAME, MAX);
-BM_POOLING(1, 3, 1025, 1025, 2, 2, SAME, MAX);
+MACE_BM_POOLING(1, 3, 129, 129, 2, 2, SAME, MAX);
+MACE_BM_POOLING(1, 3, 257, 257, 2, 2, SAME, MAX);
+MACE_BM_POOLING(1, 3, 513, 513, 2, 2, SAME, MAX);
+MACE_BM_POOLING(1, 3, 1025, 1025, 2, 2, SAME, MAX);
 }  // namespace test
 }  // namespace ops
...
mace/ops/proposal.cc
...
@@ -18,11 +18,11 @@ namespace mace {
 namespace ops {
 void Register_Proposal(OperatorRegistry *op_registry) {
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Proposal").Device(DeviceType::CPU).TypeConstraint<float>("T").Build(), ProposalOp<DeviceType::CPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Proposal").Device(DeviceType::CPU).TypeConstraint<float>("T").Build(), ProposalOp<DeviceType::CPU, float>);
 }
 }  // namespace ops
...
mace/ops/proposal.h
...
@@ -49,8 +49,8 @@ class ProposalOp : public Operator<D, T> {
   kernels::ProposalFunctor<D, T> functor_;
 protected:
-  OP_INPUT_TAGS(RPN_CLS_PROB, RPN_BBOX_PRED, IMG_INFO);
-  OP_OUTPUT_TAGS(ROIS);
+  MACE_OP_INPUT_TAGS(RPN_CLS_PROB, RPN_BBOX_PRED, IMG_INFO);
+  MACE_OP_OUTPUT_TAGS(ROIS);
 };
 }  // namespace ops
...
mace/ops/psroi_align.cc
...
@@ -18,11 +18,11 @@ namespace mace {
 namespace ops {
 void Register_PSROIAlign(OperatorRegistry *op_registry) {
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("PSROIAlign").Device(DeviceType::CPU).TypeConstraint<float>("T").Build(), PSROIAlignOp<DeviceType::CPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("PSROIAlign").Device(DeviceType::CPU).TypeConstraint<float>("T").Build(), PSROIAlignOp<DeviceType::CPU, float>);
 }
 }  // namespace ops
...
mace/ops/psroi_align.h
...
@@ -43,8 +43,8 @@ class PSROIAlignOp : public Operator<D, T> {
   kernels::PSROIAlignFunctor<D, T> functor_;
 protected:
-  OP_INPUT_TAGS(INPUT, ROIS);
-  OP_OUTPUT_TAGS(OUTPUT);
+  MACE_OP_INPUT_TAGS(INPUT, ROIS);
+  MACE_OP_OUTPUT_TAGS(OUTPUT);
 };
 }  // namespace ops
...
mace/ops/quantize.cc
...
@@ -18,27 +18,27 @@ namespace mace {
 namespace ops {
 void Register_Quantize(OperatorRegistry *op_registry) {
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Quantize").Device(DeviceType::CPU).TypeConstraint<uint8_t>("T").Build(), QuantizeOp<DeviceType::CPU, uint8_t>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Quantize").Device(DeviceType::CPU).TypeConstraint<uint8_t>("T").Build(), QuantizeOp<DeviceType::CPU, uint8_t>);
 }
 void Register_Dequantize(OperatorRegistry *op_registry) {
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Dequantize").Device(DeviceType::CPU).TypeConstraint<uint8_t>("T").Build(), DequantizeOp<DeviceType::CPU, uint8_t>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Dequantize").Device(DeviceType::CPU).TypeConstraint<uint8_t>("T").Build(), DequantizeOp<DeviceType::CPU, uint8_t>);
 }
 void Register_Requantize(OperatorRegistry *op_registry) {
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Requantize").Device(DeviceType::CPU).TypeConstraint<uint8_t>("T").Build(), RequantizeOp<DeviceType::CPU, uint8_t>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Requantize").Device(DeviceType::CPU).TypeConstraint<uint8_t>("T").Build(), RequantizeOp<DeviceType::CPU, uint8_t>);
 }
 }  // namespace ops
...
mace/ops/quantize.h
...
@@ -50,8 +50,8 @@ class QuantizeOp : public Operator<D, T> {
   kernels::QuantizeFunctor<D, T> functor_;
 protected:
-  OP_INPUT_TAGS(INPUT, IN_MIN, IN_MAX);
-  OP_OUTPUT_TAGS(OUTPUT, OUT_MIN, OUT_MAX);
+  MACE_OP_INPUT_TAGS(INPUT, IN_MIN, IN_MAX);
+  MACE_OP_OUTPUT_TAGS(OUTPUT, OUT_MIN, OUT_MAX);
 };
 template <DeviceType D, class T>
...
@@ -79,8 +79,8 @@ class DequantizeOp : public Operator<D, T> {
   kernels::DequantizeFunctor<D, T> functor_;
 protected:
-  OP_INPUT_TAGS(INPUT, IN_MIN, IN_MAX);
-  OP_OUTPUT_TAGS(OUTPUT);
+  MACE_OP_INPUT_TAGS(INPUT, IN_MIN, IN_MAX);
+  MACE_OP_OUTPUT_TAGS(OUTPUT);
 };
 template <DeviceType D, class T>
...
@@ -131,8 +131,8 @@ class RequantizeOp : public Operator<D, T> {
   kernels::RequantizeFunctor<D, T> functor_;
 protected:
-  OP_INPUT_TAGS(INPUT, IN_MIN, IN_MAX, RERANGE_MIN, RERANGE_MAX);
-  OP_OUTPUT_TAGS(OUTPUT, OUT_MIN, OUT_MAX);
+  MACE_OP_INPUT_TAGS(INPUT, IN_MIN, IN_MAX, RERANGE_MIN, RERANGE_MAX);
+  MACE_OP_OUTPUT_TAGS(OUTPUT, OUT_MIN, OUT_MAX);
 };
 }  // namespace ops
...
mace/ops/reshape.cc
...
@@ -18,11 +18,11 @@ namespace mace {
 namespace ops {
 void Register_Reshape(OperatorRegistry *op_registry) {
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Reshape").Device(DeviceType::CPU).TypeConstraint<float>("T").Build(), ReshapeOp<DeviceType::CPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Reshape").Device(DeviceType::CPU).TypeConstraint<float>("T").Build(), ReshapeOp<DeviceType::CPU, float>);
 }
 }  // namespace ops
...
mace/ops/reshape.h
...
@@ -69,8 +69,8 @@ class ReshapeOp : public Operator<D, T> {
   kernels::ReshapeFunctor<D, T> functor_;
 private:
-  OP_INPUT_TAGS(INPUT);
-  OP_OUTPUT_TAGS(OUTPUT);
+  MACE_OP_INPUT_TAGS(INPUT);
+  MACE_OP_OUTPUT_TAGS(OUTPUT);
 };
 }  // namespace ops
...
mace/ops/resize_bilinear.cc
...
@@ -18,24 +18,24 @@ namespace mace {
 namespace ops {
 void Register_ResizeBilinear(OperatorRegistry *op_registry) {
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("ResizeBilinear").Device(DeviceType::CPU).TypeConstraint<float>("T").Build(), ResizeBilinearOp<DeviceType::CPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ResizeBilinear").Device(DeviceType::CPU).TypeConstraint<float>("T").Build(), ResizeBilinearOp<DeviceType::CPU, float>);
 #ifdef MACE_ENABLE_OPENCL
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("ResizeBilinear").Device(DeviceType::GPU).TypeConstraint<float>("T").Build(), ResizeBilinearOp<DeviceType::GPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ResizeBilinear").Device(DeviceType::GPU).TypeConstraint<float>("T").Build(), ResizeBilinearOp<DeviceType::GPU, float>);
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("ResizeBilinear").Device(DeviceType::GPU).TypeConstraint<half>("T").Build(), ResizeBilinearOp<DeviceType::GPU, half>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ResizeBilinear").Device(DeviceType::GPU).TypeConstraint<half>("T").Build(), ResizeBilinearOp<DeviceType::GPU, half>);
 #endif  // MACE_ENABLE_OPENCL
 }
...
mace/ops/resize_bilinear_benchmark.cc
...
@@ -82,34 +82,33 @@ void ResizeBilinearBenchmark(int iters,
 }
 }  // namespace
-#define BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, TYPE, DEVICE) \
-  static void \
-      BM_RESIZE_BILINEAR_##N##_##C##_##H0##_##W0##_##H1##_##W1##_##TYPE##_\
-          ##DEVICE( \
-      int iters) { \
-    const int64_t macc = static_cast<int64_t>(iters) * N * C * H1 * W1 * 3; \
-    const int64_t tot = static_cast<int64_t>(iters) * N * C * H0 * W0; \
-    mace::testing::MaccProcessed(macc); \
-    mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
-    ResizeBilinearBenchmark<DEVICE, TYPE>(iters, N, C, H0, W0, H1, W1); \
-  } \
-  BENCHMARK( \
-      BM_RESIZE_BILINEAR_##N##_##C##_##H0##_##W0##_##H1##_##W1##_##TYPE##_\
+#define MACE_BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, TYPE, DEVICE) \
+  static void \
+      MACE_BM_RESIZE_BILINEAR_##N##_##C##_##H0##_##W0##_##H1##_##W1##_##TYPE##_\
+          ##DEVICE( \
+      int iters) { \
+    const int64_t macc = static_cast<int64_t>(iters) * N * C * H1 * W1 * 3; \
+    const int64_t tot = static_cast<int64_t>(iters) * N * C * H0 * W0; \
+    mace::testing::MaccProcessed(macc); \
+    mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
+    ResizeBilinearBenchmark<DEVICE, TYPE>(iters, N, C, H0, W0, H1, W1); \
+  } \
+  MACE_BENCHMARK( \
+      MACE_BM_RESIZE_BILINEAR_##N##_##C##_##H0##_##W0##_##H1##_##W1##_##TYPE##_\
       ##DEVICE)
-#define BM_RESIZE_BILINEAR(N, C, H0, W0, H1, W1) \
-  BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, float, CPU); \
-  BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, float, GPU); \
-  BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, half, GPU);
+#define MACE_BM_RESIZE_BILINEAR(N, C, H0, W0, H1, W1) \
+  MACE_BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, float, CPU); \
+  MACE_BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, float, GPU); \
+  MACE_BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, half, GPU);
-BM_RESIZE_BILINEAR(1, 128, 120, 120, 480, 480);
-BM_RESIZE_BILINEAR(1, 256, 7, 7, 15, 15);
-BM_RESIZE_BILINEAR(1, 256, 15, 15, 30, 30);
-BM_RESIZE_BILINEAR(1, 128, 30, 30, 60, 60);
-BM_RESIZE_BILINEAR(1, 128, 240, 240, 480, 480);
-BM_RESIZE_BILINEAR(1, 3, 4032, 3016, 480, 480);
-BM_RESIZE_BILINEAR(1, 3, 480, 480, 4032, 3016);
+MACE_BM_RESIZE_BILINEAR(1, 128, 120, 120, 480, 480);
+MACE_BM_RESIZE_BILINEAR(1, 256, 7, 7, 15, 15);
+MACE_BM_RESIZE_BILINEAR(1, 256, 15, 15, 30, 30);
+MACE_BM_RESIZE_BILINEAR(1, 128, 30, 30, 60, 60);
+MACE_BM_RESIZE_BILINEAR(1, 128, 240, 240, 480, 480);
+MACE_BM_RESIZE_BILINEAR(1, 3, 4032, 3016, 480, 480);
+MACE_BM_RESIZE_BILINEAR(1, 3, 480, 480, 4032, 3016);
 }  // namespace test
 }  // namespace ops
...
mace/ops/slice.cc
...
@@ -18,24 +18,24 @@ namespace mace {
 namespace ops {
 void Register_Slice(OperatorRegistry *op_registry) {
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Slice").Device(DeviceType::CPU).TypeConstraint<float>("T").Build(), SliceOp<DeviceType::CPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Slice").Device(DeviceType::CPU).TypeConstraint<float>("T").Build(), SliceOp<DeviceType::CPU, float>);
 #ifdef MACE_ENABLE_OPENCL
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Slice").Device(DeviceType::GPU).TypeConstraint<float>("T").Build(), SliceOp<DeviceType::GPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Slice").Device(DeviceType::GPU).TypeConstraint<float>("T").Build(), SliceOp<DeviceType::GPU, float>);
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Slice").Device(DeviceType::GPU).TypeConstraint<half>("T").Build(), SliceOp<DeviceType::GPU, half>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Slice").Device(DeviceType::GPU).TypeConstraint<half>("T").Build(), SliceOp<DeviceType::GPU, half>);
 #endif  // MACE_ENABLE_OPENCL
 }
...
...
mace/ops/slice.h
浏览文件 @
b649be5f
...
...
@@ -46,7 +46,7 @@ class SliceOp : public Operator<D, T> {
kernels
::
SliceFunctor
<
D
,
T
>
functor_
;
private:
OP_INPUT_TAGS
(
INPUT
);
MACE_
OP_INPUT_TAGS
(
INPUT
);
};
}
// namespace ops
...
...
mace/ops/slice_benchmark.cc
...
...
@@ -73,26 +73,28 @@ void BMSliceHelper(int iters,
}
}
// namespace
#define
BM_SLICE_MACRO(N, H, W, C, NO, TYPE, DEVICE)
\
#define
MACE_BM_SLICE_MACRO(N, H, W, C, NO, TYPE, DEVICE)
\
static void \
BM_SLICE_##N##_##H##_##W##_##C##_##NO##_##TYPE##_##DEVICE(int iters) { \
MACE_BM_SLICE_##N##_##H##_##W##_##C##_##NO##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * H * W * C; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BMSliceHelper<DEVICE, TYPE>(iters, {N, H, W, C}, NO); \
} \
BENCHMARK(BM_SLICE_##N##_##H##_##W##_##C##_##NO##_##TYPE##_##DEVICE)
MACE_BENCHMARK( \
MACE_BM_SLICE_##N##_##H##_##W##_##C##_##NO##_##TYPE##_##DEVICE)
#define BM_SLICE(N, H, W, C, NO) \
BM_SLICE_MACRO(N, H, W, C, NO, float, CPU); \
BM_SLICE_MACRO(N, H, W, C, NO, float, GPU);
\
BM_SLICE_MACRO(N, H, W, C, NO, half, GPU);
#define
MACE_
BM_SLICE(N, H, W, C, NO) \
MACE_
BM_SLICE_MACRO(N, H, W, C, NO, float, CPU); \
MACE_BM_SLICE_MACRO(N, H, W, C, NO, float, GPU);
\
MACE_
BM_SLICE_MACRO(N, H, W, C, NO, half, GPU);
BM_SLICE
(
1
,
32
,
32
,
32
,
2
);
BM_SLICE
(
1
,
32
,
32
,
128
,
2
);
BM_SLICE
(
1
,
32
,
32
,
256
,
2
);
BM_SLICE
(
1
,
128
,
128
,
32
,
2
);
BM_SLICE
(
1
,
128
,
128
,
128
,
2
);
MACE_
BM_SLICE
(
1
,
32
,
32
,
32
,
2
);
MACE_
BM_SLICE
(
1
,
32
,
32
,
128
,
2
);
MACE_
BM_SLICE
(
1
,
32
,
32
,
256
,
2
);
MACE_
BM_SLICE
(
1
,
128
,
128
,
32
,
2
);
MACE_
BM_SLICE
(
1
,
128
,
128
,
128
,
2
);
}
// namespace test
}
// namespace ops
...
...
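For a sense of what a benchmark macro of this shape does once renamed, here is a minimal, self-contained sketch of the token-pasting pattern. MY_BENCHMARK and MY_BM_DEMO_MACRO are hypothetical stand-ins for MACE_BENCHMARK and MACE_BM_SLICE_MACRO; the output-count parameter NO and the mace::testing calls are dropped to keep it short, and registration is faked by running the function once at static-initialization time:

// Hypothetical stand-ins; the real MACE_BENCHMARK stores a pointer for the
// benchmark runner instead of calling the function directly.
#include <cstdint>
#include <cstdio>

#define MY_BENCHMARK(name) \
  static const int registered_##name = (name(1), std::printf("registered " #name "\n"), 0)

#define MY_BM_DEMO_MACRO(N, H, W, C, TYPE, DEVICE)                    \
  static void MY_BM_DEMO_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE(   \
      int iters) {                                                    \
    const int64_t tot = static_cast<int64_t>(iters) * N * H * W * C;  \
    std::printf("%lld items on " #DEVICE "\n",                        \
                static_cast<long long>(tot));                         \
  }                                                                   \
  MY_BENCHMARK(MY_BM_DEMO_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)

// Pastes the arguments into a unique function name,
// MY_BM_DEMO_1_32_32_32_float_CPU, and registers it; after this commit the
// MACE_ prefix appears in both the macro and the generated identifier.
MY_BM_DEMO_MACRO(1, 32, 32, 32, float, CPU);

int main() { return 0; }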
mace/ops/softmax.cc (View file @ b649be5f)

@@ -18,24 +18,24 @@ namespace mace {
namespace ops {

void Register_Softmax(OperatorRegistry *op_registry) {
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Softmax")
-                        .Device(DeviceType::CPU)
-                        .TypeConstraint<float>("T")
-                        .Build(),
-                    SoftmaxOp<DeviceType::CPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Softmax")
+                             .Device(DeviceType::CPU)
+                             .TypeConstraint<float>("T")
+                             .Build(),
+                         SoftmaxOp<DeviceType::CPU, float>);

#ifdef MACE_ENABLE_OPENCL
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Softmax")
-                        .Device(DeviceType::GPU)
-                        .TypeConstraint<float>("T")
-                        .Build(),
-                    SoftmaxOp<DeviceType::GPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Softmax")
+                             .Device(DeviceType::GPU)
+                             .TypeConstraint<float>("T")
+                             .Build(),
+                         SoftmaxOp<DeviceType::GPU, float>);

-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Softmax")
-                        .Device(DeviceType::GPU)
-                        .TypeConstraint<half>("T")
-                        .Build(),
-                    SoftmaxOp<DeviceType::GPU, half>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Softmax")
+                             .Device(DeviceType::GPU)
+                             .TypeConstraint<half>("T")
+                             .Build(),
+                         SoftmaxOp<DeviceType::GPU, half>);
#endif  // MACE_ENABLE_OPENCL
}
...
mace/ops/softmax.h (View file @ b649be5f)

@@ -40,8 +40,8 @@ class SoftmaxOp : public Operator<D, T> {
  kernels::SoftmaxFunctor<D, T> functor_;

 protected:
-  OP_INPUT_TAGS(LOGITS);
-  OP_OUTPUT_TAGS(OUTPUT);
+  MACE_OP_INPUT_TAGS(LOGITS);
+  MACE_OP_OUTPUT_TAGS(OUTPUT);
};

}  // namespace ops
...
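The MACE_OP_INPUT_TAGS / MACE_OP_OUTPUT_TAGS pair used here is likewise just a renamed class-scope macro. A hypothetical minimal version of such tag macros (DEMO_OP_INPUT_TAGS / DEMO_OP_OUTPUT_TAGS and DemoOp below are illustrations only; the real definitions in mace/core/operator.h may differ) declares enums so inputs and outputs can be referred to by name rather than by raw index, covering both the single-input case above (LOGITS) and the two-input case (INPUT, BIAS) used by the Winograd op further down:

// Hypothetical minimal tag macros; they expand to enums whose enumerators
// number an op's inputs and outputs starting at 0.
#include <iostream>

#define DEMO_OP_INPUT_TAGS(...) enum _InputTags { __VA_ARGS__ }
#define DEMO_OP_OUTPUT_TAGS(...) enum _OutputTags { __VA_ARGS__ }

class DemoOp {
 public:
  // INPUT == 0, BIAS == 1, OUTPUT == 0, usable wherever an index is expected.
  static int bias_index() { return BIAS; }
  static int output_index() { return OUTPUT; }

 protected:
  DEMO_OP_INPUT_TAGS(INPUT, BIAS);
  DEMO_OP_OUTPUT_TAGS(OUTPUT);
};

int main() {
  std::cout << DemoOp::bias_index() << " "
            << DemoOp::output_index() << "\n";  // prints "1 0"
  return 0;
}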
mace/ops/softmax_benchmark.cc (View file @ b649be5f)

@@ -70,26 +70,26 @@ void SoftmaxBenchmark(
}
}  // namespace

-#define BM_SOFTMAX_MACRO(N, C, H, W, TYPE, DEVICE)                       \
-  static void BM_SOFTMAX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(      \
-      int iters) {                                                       \
-    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;     \
-    mace::testing::MaccProcessed(tot);                                   \
-    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                  \
-    SoftmaxBenchmark<DEVICE, TYPE>(iters, N, C, H, W);                   \
-  }                                                                      \
-  BENCHMARK(BM_SOFTMAX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
+#define MACE_BM_SOFTMAX_MACRO(N, C, H, W, TYPE, DEVICE)                  \
+  static void MACE_BM_SOFTMAX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
+      int iters) {                                                       \
+    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;     \
+    mace::testing::MaccProcessed(tot);                                   \
+    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                  \
+    SoftmaxBenchmark<DEVICE, TYPE>(iters, N, C, H, W);                   \
+  }                                                                      \
+  MACE_BENCHMARK(MACE_BM_SOFTMAX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)

-#define BM_SOFTMAX(N, C, H, W)                 \
-  BM_SOFTMAX_MACRO(N, C, H, W, float, CPU);    \
-  BM_SOFTMAX_MACRO(N, C, H, W, float, GPU);    \
-  BM_SOFTMAX_MACRO(N, C, H, W, half, GPU);
+#define MACE_BM_SOFTMAX(N, C, H, W)                 \
+  MACE_BM_SOFTMAX_MACRO(N, C, H, W, float, CPU);    \
+  MACE_BM_SOFTMAX_MACRO(N, C, H, W, float, GPU);    \
+  MACE_BM_SOFTMAX_MACRO(N, C, H, W, half, GPU);

-BM_SOFTMAX(1, 2, 512, 512);
-BM_SOFTMAX(1, 3, 512, 512);
-BM_SOFTMAX(1, 4, 512, 512);
-BM_SOFTMAX(1, 10, 256, 256);
-BM_SOFTMAX(1, 1024, 7, 7);
+MACE_BM_SOFTMAX(1, 2, 512, 512);
+MACE_BM_SOFTMAX(1, 3, 512, 512);
+MACE_BM_SOFTMAX(1, 4, 512, 512);
+MACE_BM_SOFTMAX(1, 10, 256, 256);
+MACE_BM_SOFTMAX(1, 1024, 7, 7);

}  // namespace test
}  // namespace ops
...
mace/ops/space_to_batch.cc (View file @ b649be5f)

@@ -18,23 +18,23 @@ namespace mace {
namespace ops {

void Register_SpaceToBatchND(OperatorRegistry *op_registry) {
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToBatchND")
-                        .Device(DeviceType::CPU)
-                        .TypeConstraint<float>("T")
-                        .Build(),
-                    SpaceToBatchNDOp<DeviceType::CPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToBatchND")
+                             .Device(DeviceType::CPU)
+                             .TypeConstraint<float>("T")
+                             .Build(),
+                         SpaceToBatchNDOp<DeviceType::CPU, float>);

#ifdef MACE_ENABLE_OPENCL
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToBatchND")
-                        .Device(DeviceType::GPU)
-                        .TypeConstraint<float>("T")
-                        .Build(),
-                    SpaceToBatchNDOp<DeviceType::GPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToBatchND")
+                             .Device(DeviceType::GPU)
+                             .TypeConstraint<float>("T")
+                             .Build(),
+                         SpaceToBatchNDOp<DeviceType::GPU, float>);

-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToBatchND")
-                        .Device(DeviceType::GPU)
-                        .TypeConstraint<half>("T")
-                        .Build(),
-                    SpaceToBatchNDOp<DeviceType::GPU, half>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToBatchND")
+                             .Device(DeviceType::GPU)
+                             .TypeConstraint<half>("T")
+                             .Build(),
+                         SpaceToBatchNDOp<DeviceType::GPU, half>);
#endif  // MACE_ENABLE_OPENCL
}
...
mace/ops/space_to_batch.h (View file @ b649be5f)

@@ -45,8 +45,8 @@ class SpaceToBatchNDOp : public Operator<D, T> {
  kernels::SpaceToBatchFunctor<D, T> functor_;

 protected:
-  OP_INPUT_TAGS(INPUT);
-  OP_OUTPUT_TAGS(OUTPUT);
+  MACE_OP_INPUT_TAGS(INPUT);
+  MACE_OP_OUTPUT_TAGS(OUTPUT);
};

}  // namespace ops
...
mace/ops/space_to_batch_benchmark.cc (View file @ b649be5f)

@@ -64,27 +64,27 @@ void BMSpaceToBatch(
}
}  // namespace

-#define BM_SPACE_TO_BATCH_MACRO(N, H, W, C, SHAPE, TYPE, DEVICE)              \
-  static void                                                                 \
-      BM_SPACE_TO_BATCH_##N##_##H##_##W##_##C##_##SHAPE##_##TYPE##_##DEVICE(  \
-          int iters) {                                                        \
-    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;          \
-    mace::testing::MaccProcessed(tot);                                        \
-    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                       \
-    BMSpaceToBatch<DEVICE, TYPE>(iters, N, H, W, C, SHAPE);                   \
-  }                                                                           \
-  BENCHMARK(                                                                  \
-      BM_SPACE_TO_BATCH_##N##_##H##_##W##_##C##_##SHAPE##_##TYPE##_##DEVICE)
+#define MACE_BM_SPACE_TO_BATCH_MACRO(N, H, W, C, SHAPE, TYPE, DEVICE)         \
+  static void                                                                 \
+  MACE_BM_SPACE_TO_BATCH_##N##_##H##_##W##_##C##_##SHAPE##_##TYPE##_##DEVICE( \
+      int iters) {                                                            \
+    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;          \
+    mace::testing::MaccProcessed(tot);                                        \
+    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                       \
+    BMSpaceToBatch<DEVICE, TYPE>(iters, N, H, W, C, SHAPE);                   \
+  }                                                                           \
+  MACE_BENCHMARK(                                                             \
+      MACE_BM_SPACE_TO_BATCH_##N##_##H##_##W##_##C##_##SHAPE##_##TYPE##_##DEVICE)

-#define BM_SPACE_TO_BATCH(N, H, W, C, SHAPE)               \
-  BM_SPACE_TO_BATCH_MACRO(N, H, W, C, SHAPE, float, GPU);  \
-  BM_SPACE_TO_BATCH_MACRO(N, H, W, C, SHAPE, float, CPU);
+#define MACE_BM_SPACE_TO_BATCH(N, H, W, C, SHAPE)               \
+  MACE_BM_SPACE_TO_BATCH_MACRO(N, H, W, C, SHAPE, float, GPU);  \
+  MACE_BM_SPACE_TO_BATCH_MACRO(N, H, W, C, SHAPE, float, CPU);

-BM_SPACE_TO_BATCH(128, 16, 16, 128, 2);
-BM_SPACE_TO_BATCH(1, 256, 256, 32, 2);
-BM_SPACE_TO_BATCH(1, 256, 256, 16, 2);
-BM_SPACE_TO_BATCH(1, 256, 256, 32, 4);
-BM_SPACE_TO_BATCH(1, 256, 256, 32, 8);
+MACE_BM_SPACE_TO_BATCH(128, 16, 16, 128, 2);
+MACE_BM_SPACE_TO_BATCH(1, 256, 256, 32, 2);
+MACE_BM_SPACE_TO_BATCH(1, 256, 256, 16, 2);
+MACE_BM_SPACE_TO_BATCH(1, 256, 256, 32, 4);
+MACE_BM_SPACE_TO_BATCH(1, 256, 256, 32, 8);

}  // namespace test
}  // namespace ops
...
mace/ops/space_to_depth.cc (View file @ b649be5f)

@@ -18,24 +18,24 @@ namespace mace {
namespace ops {

void Register_SpaceToDepth(OperatorRegistry *op_registry) {
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToDepth")
-                        .Device(DeviceType::CPU)
-                        .TypeConstraint<float>("T")
-                        .Build(),
-                    SpaceToDepthOp<DeviceType::CPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToDepth")
+                             .Device(DeviceType::CPU)
+                             .TypeConstraint<float>("T")
+                             .Build(),
+                         SpaceToDepthOp<DeviceType::CPU, float>);

#ifdef MACE_ENABLE_OPENCL
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToDepth")
-                        .Device(DeviceType::GPU)
-                        .TypeConstraint<float>("T")
-                        .Build(),
-                    SpaceToDepthOp<DeviceType::GPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToDepth")
+                             .Device(DeviceType::GPU)
+                             .TypeConstraint<float>("T")
+                             .Build(),
+                         SpaceToDepthOp<DeviceType::GPU, float>);

-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToDepth")
-                        .Device(DeviceType::GPU)
-                        .TypeConstraint<half>("T")
-                        .Build(),
-                    SpaceToDepthOp<DeviceType::GPU, half>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToDepth")
+                             .Device(DeviceType::GPU)
+                             .TypeConstraint<half>("T")
+                             .Build(),
+                         SpaceToDepthOp<DeviceType::GPU, half>);
#endif  // MACE_ENABLE_OPENCL
}
...
mace/ops/space_to_depth.h (View file @ b649be5f)

@@ -62,8 +62,8 @@ class SpaceToDepthOp : public Operator<D, T> {
  }

 protected:
-  OP_INPUT_TAGS(INPUT);
-  OP_OUTPUT_TAGS(OUTPUT);
+  MACE_OP_INPUT_TAGS(INPUT);
+  MACE_OP_OUTPUT_TAGS(OUTPUT);

 private:
  kernels::DepthToSpaceOpFunctor<D, T> functor_;
...
mace/ops/space_to_depth_benchmark.cc (View file @ b649be5f)

@@ -69,25 +69,26 @@ void SpaceToDepth(
}
}  // namespace

-#define BM_SPACE_TO_DEPTH_MACRO(N, C, H, W, G, TYPE, DEVICE)                  \
-  static void                                                                 \
-      BM_SPACE_TO_DEPTH_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE(      \
-          int iters) {                                                        \
-    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;          \
-    mace::testing::MaccProcessed(tot);                                        \
-    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                       \
-    SpaceToDepth<DEVICE, TYPE>(iters, N, C, H, W, G);                         \
-  }                                                                           \
-  BENCHMARK(BM_SPACE_TO_DEPTH_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE)
+#define MACE_BM_SPACE_TO_DEPTH_MACRO(N, C, H, W, G, TYPE, DEVICE)             \
+  static void                                                                 \
+      MACE_BM_SPACE_TO_DEPTH_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE( \
+          int iters) {                                                        \
+    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;          \
+    mace::testing::MaccProcessed(tot);                                        \
+    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                       \
+    SpaceToDepth<DEVICE, TYPE>(iters, N, C, H, W, G);                         \
+  }                                                                           \
+  MACE_BENCHMARK(                                                             \
+      MACE_BM_SPACE_TO_DEPTH_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE)

-#define BM_SPACE_TO_DEPTH(N, C, H, W, G)                 \
-  BM_SPACE_TO_DEPTH_MACRO(N, C, H, W, G, float, CPU);    \
-  BM_SPACE_TO_DEPTH_MACRO(N, C, H, W, G, float, GPU);    \
-  BM_SPACE_TO_DEPTH_MACRO(N, C, H, W, G, half, GPU);
+#define MACE_BM_SPACE_TO_DEPTH(N, C, H, W, G)                 \
+  MACE_BM_SPACE_TO_DEPTH_MACRO(N, C, H, W, G, float, CPU);    \
+  MACE_BM_SPACE_TO_DEPTH_MACRO(N, C, H, W, G, float, GPU);    \
+  MACE_BM_SPACE_TO_DEPTH_MACRO(N, C, H, W, G, half, GPU);

-BM_SPACE_TO_DEPTH(1, 64, 64, 64, 4);
-BM_SPACE_TO_DEPTH(1, 64, 128, 128, 4);
-BM_SPACE_TO_DEPTH(1, 64, 256, 256, 4);
+MACE_BM_SPACE_TO_DEPTH(1, 64, 64, 64, 4);
+MACE_BM_SPACE_TO_DEPTH(1, 64, 128, 128, 4);
+MACE_BM_SPACE_TO_DEPTH(1, 64, 256, 256, 4);

}  // namespace test
}  // namespace ops
...
mace/ops/transpose.cc (View file @ b649be5f)

@@ -18,11 +18,11 @@ namespace mace {
namespace ops {

void Register_Transpose(OperatorRegistry *op_registry) {
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Transpose")
-                        .Device(DeviceType::CPU)
-                        .TypeConstraint<float>("T")
-                        .Build(),
-                    TransposeOp<DeviceType::CPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Transpose")
+                             .Device(DeviceType::CPU)
+                             .TypeConstraint<float>("T")
+                             .Build(),
+                         TransposeOp<DeviceType::CPU, float>);
}

}  // namespace ops
...
mace/ops/transpose.h (View file @ b649be5f)

@@ -50,8 +50,8 @@ class TransposeOp : public Operator<D, T> {
  std::vector<int> dims_;
  kernels::TransposeFunctor<D, T> functor_;

-  OP_INPUT_TAGS(INPUT);
-  OP_OUTPUT_TAGS(OUTPUT);
+  MACE_OP_INPUT_TAGS(INPUT);
+  MACE_OP_OUTPUT_TAGS(OUTPUT);
};

}  // namespace mace
...
mace/ops/transpose_benchmark.cc (View file @ b649be5f)

@@ -55,41 +55,41 @@ void TransposeBenchmark(int iters,
}
}  // namespace

-#define BM_TRANSPOSE2D_MACRO(H, W, TYPE, DEVICE)                  \
-  static void BM_TRANSPOSE2D_##H##_##W##_##TYPE##_##DEVICE(       \
+#define MACE_BM_TRANSPOSE2D_MACRO(H, W, TYPE, DEVICE)             \
+  static void MACE_BM_TRANSPOSE2D_##H##_##W##_##TYPE##_##DEVICE(  \
      int iters) {                                                 \
    const int64_t tot = static_cast<int64_t>(iters) * H * W;       \
    mace::testing::MaccProcessed(tot);                             \
    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));            \
    TransposeBenchmark<DEVICE, TYPE>(iters, {H, W}, {1, 0});       \
  }                                                                \
-  BENCHMARK(BM_TRANSPOSE2D_##H##_##W##_##TYPE##_##DEVICE)
-#define BM_TRANSPOSE2D(H, W) \
-  BM_TRANSPOSE2D_MACRO(H, W, float, CPU);
-#define BM_TRANSPOSE4D_MACRO(N, C, H, W, D0, D1, D2, D3, TYPE, DEVICE)             \
-  static void                                                                      \
-      BM_TRANSPOSE4D_##N##_##C##_##H##_##W##_##D0##D1##D2##D3##_##TYPE##_##DEVICE( \
-          int iters) {                                                             \
-    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;               \
-    mace::testing::MaccProcessed(tot);                                             \
-    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                            \
-    TransposeBenchmark<DEVICE, TYPE>(iters, {N, C, H, W}, {D0, D1, D2, D3});       \
-  }                                                                                \
-  BENCHMARK(                                                                       \
-      BM_TRANSPOSE4D_##N##_##C##_##H##_##W##_##D0##D1##D2##D3##_##TYPE##_##DEVICE)
-#define BM_TRANSPOSE4D(N, C, H, W, D0, D1, D2, D3) \
-  BM_TRANSPOSE4D_MACRO(N, C, H, W, D0, D1, D2, D3, float, CPU);
-BM_TRANSPOSE4D(1, 512, 512, 3, 0, 3, 1, 2);
-BM_TRANSPOSE4D(1, 2, 512, 512, 0, 2, 3, 1);
-BM_TRANSPOSE4D(1, 64, 64, 512, 0, 3, 1, 2);
-BM_TRANSPOSE4D(1, 512, 64, 64, 0, 2, 3, 1);
-BM_TRANSPOSE2D(128, 128);
-BM_TRANSPOSE2D(512, 512);
+  MACE_BENCHMARK(MACE_BM_TRANSPOSE2D_##H##_##W##_##TYPE##_##DEVICE)
+#define MACE_BM_TRANSPOSE2D(H, W) \
+  MACE_BM_TRANSPOSE2D_MACRO(H, W, float, CPU);
+#define MACE_BM_TRANSPOSE4D_MACRO(N, C, H, W, D0, D1, D2, D3, TYPE, DEVICE)   \
+  static void                                                                 \
+      MACE_BM_TRANSPOSE4D_##N##_##C##_##H##_##W##_##D0##D1##D2##D3##_##TYPE##_##\
+      DEVICE(int iters) {                                                     \
+    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;          \
+    mace::testing::MaccProcessed(tot);                                        \
+    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                       \
+    TransposeBenchmark<DEVICE, TYPE>(iters, {N, C, H, W}, {D0, D1, D2, D3});  \
+  }                                                                           \
+  MACE_BENCHMARK(                                                             \
+      MACE_BM_TRANSPOSE4D_##N##_##C##_##H##_##W##_##D0##D1##D2##D3##_##TYPE##_##\
+      DEVICE)
+#define MACE_BM_TRANSPOSE4D(N, C, H, W, D0, D1, D2, D3) \
+  MACE_BM_TRANSPOSE4D_MACRO(N, C, H, W, D0, D1, D2, D3, float, CPU);
+MACE_BM_TRANSPOSE4D(1, 512, 512, 3, 0, 3, 1, 2);
+MACE_BM_TRANSPOSE4D(1, 2, 512, 512, 0, 2, 3, 1);
+MACE_BM_TRANSPOSE4D(1, 64, 64, 512, 0, 3, 1, 2);
+MACE_BM_TRANSPOSE4D(1, 512, 64, 64, 0, 2, 3, 1);
+MACE_BM_TRANSPOSE2D(128, 128);
+MACE_BM_TRANSPOSE2D(512, 512);

}  // namespace test
}  // namespace ops
...
mace/ops/winograd_inverse_transform.cc (View file @ b649be5f)

@@ -19,17 +19,17 @@ namespace ops {
void Register_WinogradInverseTransform(OperatorRegistry *op_registry) {
#ifdef MACE_ENABLE_OPENCL
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradInverseTransform")
-                        .Device(DeviceType::GPU)
-                        .TypeConstraint<float>("T")
-                        .Build(),
-                    WinogradInverseTransformOp<DeviceType::GPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradInverseTransform")
+                             .Device(DeviceType::GPU)
+                             .TypeConstraint<float>("T")
+                             .Build(),
+                         WinogradInverseTransformOp<DeviceType::GPU, float>);

-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradInverseTransform")
-                        .Device(DeviceType::GPU)
-                        .TypeConstraint<half>("T")
-                        .Build(),
-                    WinogradInverseTransformOp<DeviceType::GPU, half>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradInverseTransform")
+                             .Device(DeviceType::GPU)
+                             .TypeConstraint<half>("T")
+                             .Build(),
+                         WinogradInverseTransformOp<DeviceType::GPU, half>);
#else
  MACE_UNUSED(op_registry);
#endif  // MACE_ENABLE_OPENCL
...
mace/ops/winograd_inverse_transform.h (View file @ b649be5f)

@@ -49,8 +49,8 @@ class WinogradInverseTransformOp : public Operator<D, T> {
  kernels::WinogradInverseTransformFunctor<D, T> functor_;

 protected:
-  OP_INPUT_TAGS(INPUT, BIAS);
-  OP_OUTPUT_TAGS(OUTPUT);
+  MACE_OP_INPUT_TAGS(INPUT, BIAS);
+  MACE_OP_OUTPUT_TAGS(OUTPUT);
};

}  // namespace ops
...
mace/ops/winograd_transform.cc (View file @ b649be5f)

@@ -19,17 +19,17 @@ namespace ops {
void Register_WinogradTransform(OperatorRegistry *op_registry) {
#ifdef MACE_ENABLE_OPENCL
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradTransform")
-                        .Device(DeviceType::GPU)
-                        .TypeConstraint<float>("T")
-                        .Build(),
-                    WinogradTransformOp<DeviceType::GPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradTransform")
+                             .Device(DeviceType::GPU)
+                             .TypeConstraint<float>("T")
+                             .Build(),
+                         WinogradTransformOp<DeviceType::GPU, float>);

-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradTransform")
-                        .Device(DeviceType::GPU)
-                        .TypeConstraint<half>("T")
-                        .Build(),
-                    WinogradTransformOp<DeviceType::GPU, half>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradTransform")
+                             .Device(DeviceType::GPU)
+                             .TypeConstraint<half>("T")
+                             .Build(),
+                         WinogradTransformOp<DeviceType::GPU, half>);
#else
  MACE_UNUSED(op_registry);
#endif  // MACE_ENABLE_OPENCL
...
mace/ops/winograd_transform.h (View file @ b649be5f)

@@ -43,8 +43,8 @@ class WinogradTransformOp : public Operator<D, T> {
  kernels::WinogradTransformFunctor<D, T> functor_;

 protected:
-  OP_INPUT_TAGS(INPUT);
-  OP_OUTPUT_TAGS(OUTPUT);
+  MACE_OP_INPUT_TAGS(INPUT);
+  MACE_OP_OUTPUT_TAGS(OUTPUT);
};

}  // namespace ops
...
mace/ops/winograd_transform_benchmark.cc (View file @ b649be5f)

@@ -51,22 +51,24 @@ void BMWinogradTransform(
}
}  // namespace

-#define BM_WINOGRAD_TRANSFORM_MACRO(N, H, W, C, TYPE, DEVICE)                  \
-  static void BM_WINOGRAD_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \
+#define MACE_BM_WINOGRAD_TRANSFORM_MACRO(N, H, W, C, TYPE, DEVICE)             \
+  static void                                                                  \
+      MACE_BM_WINOGRAD_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE(    \
      int iters) {                                                              \
    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;            \
    mace::testing::MaccProcessed(tot);                                          \
    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                         \
    BMWinogradTransform<DEVICE, TYPE>(iters, N, H, W, C);                       \
  }                                                                             \
-  BENCHMARK(BM_WINOGRAD_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)
+  MACE_BENCHMARK(                                                              \
+      MACE_BM_WINOGRAD_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)

-#define BM_WINOGRAD_TRANSFORM(N, H, W, C) \
-  BM_WINOGRAD_TRANSFORM_MACRO(N, H, W, C, half, GPU);
+#define MACE_BM_WINOGRAD_TRANSFORM(N, H, W, C) \
+  MACE_BM_WINOGRAD_TRANSFORM_MACRO(N, H, W, C, half, GPU);

-BM_WINOGRAD_TRANSFORM(1, 16, 16, 128);
-BM_WINOGRAD_TRANSFORM(1, 64, 64, 128);
-BM_WINOGRAD_TRANSFORM(1, 128, 128, 128);
+MACE_BM_WINOGRAD_TRANSFORM(1, 16, 16, 128);
+MACE_BM_WINOGRAD_TRANSFORM(1, 64, 64, 128);
+MACE_BM_WINOGRAD_TRANSFORM(1, 128, 128, 128);

namespace {
template <DeviceType D, typename T>

@@ -103,24 +105,24 @@ void BMWinogradInverseTransform(
}
}  // namespace

-#define BM_WINOGRAD_INVERSE_TRANSFORM_MACRO(N, H, W, C, TYPE, DEVICE)          \
+#define MACE_BM_WINOGRAD_INVERSE_TRANSFORM_MACRO(N, H, W, C, TYPE, DEVICE)     \
   static void                                                                  \
-      BM_WINOGRAD_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \
-          int iters) {                                                         \
+      MACE_BM_WINOGRAD_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \
+          int iters) {                                                         \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;           \
     mace::testing::MaccProcessed(tot);                                         \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                        \
     BMWinogradInverseTransform<DEVICE, TYPE>(iters, N, H, W, C);               \
   }                                                                            \
-  BENCHMARK(                                                                   \
-      BM_WINOGRAD_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)
+  MACE_BENCHMARK(                                                              \
+      MACE_BM_WINOGRAD_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)

-#define BM_WINOGRAD_INVERSE_TRANSFORM(N, H, W, C) \
-  BM_WINOGRAD_INVERSE_TRANSFORM_MACRO(N, H, W, C, half, GPU);
+#define MACE_BM_WINOGRAD_INVERSE_TRANSFORM(N, H, W, C) \
+  MACE_BM_WINOGRAD_INVERSE_TRANSFORM_MACRO(N, H, W, C, half, GPU);

-BM_WINOGRAD_INVERSE_TRANSFORM(1, 14, 14, 32);
-BM_WINOGRAD_INVERSE_TRANSFORM(1, 62, 62, 32);
-BM_WINOGRAD_INVERSE_TRANSFORM(1, 126, 126, 32);
+MACE_BM_WINOGRAD_INVERSE_TRANSFORM(1, 14, 14, 32);
+MACE_BM_WINOGRAD_INVERSE_TRANSFORM(1, 62, 62, 32);
+MACE_BM_WINOGRAD_INVERSE_TRANSFORM(1, 126, 126, 32);

}  // namespace test
}  // namespace ops
...