慢慢CG / Mace (forked from Xiaomi / Mace)

Commit 780f5a60 (parent 07f8ff18)
Authored Feb 08, 2018 by Liangliang He

Refactor conv2d NEON implementations

52 changed files, +904 additions, -1549 deletions
Changed files:

mace/core/tensor.h (+2, -1)
mace/kernels/BUILD (+4, -5)
mace/kernels/addn.h (+6, -3)
mace/kernels/batch_norm.h (+3, -3)
mace/kernels/conv_2d.h (+518, -49)
mace/kernels/conv_pool_2d_util.cc (+88, -41)
mace/kernels/conv_pool_2d_util.h (+6, -0)
mace/kernels/depthwise_conv2d.h (+2, -2)
mace/kernels/neon/addn_neon.cc (+3, -1)
mace/kernels/neon/avg_pooling_neon_3x3.cc (+0, -224)
mace/kernels/neon/batch_norm_neon.cc (+30, -18)
mace/kernels/neon/global_avg_pooling_neon.cc (+0, -56)
mace/kernels/neon/max_pooling_neon_2x2.cc (+0, -173)
mace/kernels/neon/max_pooling_neon_3x3.cc (+0, -220)
mace/kernels/neon/pooling_neon.cc (+0, -131)
mace/kernels/neon/relu_neon.cc (+0, -70)
mace/kernels/opencl/cl/buffer_to_image.cl (+25, -20)
mace/kernels/opencl/depthwise_conv_opencl.cc (+2, -2)
mace/kernels/opencl/helper.cc (+5, -4)
mace/ops/activation.cc (+0, -8)
mace/ops/activation_benchmark.cc (+10, -9)
mace/ops/activation_test.cc (+0, -36)
mace/ops/addn_benchmark.cc (+11, -11)
mace/ops/addn_test.cc (+0, -6)
mace/ops/batch_norm_benchmark.cc (+20, -17)
mace/ops/batch_norm_test.cc (+8, -16)
mace/ops/batch_to_space_benchmark.cc (+6, -6)
mace/ops/bias_add.cc (+0, -10)
mace/ops/bias_add_benchmark.cc (+18, -17)
mace/ops/buffer_to_image.cc (+1, -1)
mace/ops/buffer_to_image.h (+2, -2)
mace/ops/conv_2d.cc (+0, -14)
mace/ops/conv_2d_benchmark.cc (+44, -48)
mace/ops/conv_2d_test.cc (+12, -164)
mace/ops/depthwise_conv2d.cc (+0, -8)
mace/ops/depthwise_conv2d_test.cc (+0, -19)
mace/ops/depthwise_conv_2d_benchmark.cc (+16, -15)
mace/ops/folded_batch_norm.cc (+0, -8)
mace/ops/fused_conv_2d.cc (+0, -6)
mace/ops/fused_conv_2d_test.cc (+6, -6)
mace/ops/global_avg_pooling.cc (+0, -8)
mace/ops/global_avg_pooling_test.cc (+0, -26)
mace/ops/matmul_benchmark.cc (+6, -6)
mace/ops/pooling.cc (+0, -8)
mace/ops/resize_bilinear.cc (+0, -8)
mace/ops/resize_bilinear_benchmark.cc (+12, -12)
mace/ops/softmax_benchmark.cc (+9, -8)
mace/ops/space_to_batch_benchmark.cc (+6, -6)
mace/ops/winograd_convolution_test.cc (+4, -4)
mace/ops/winograd_transform_benchmark.cc (+11, -11)
mace/utils/utils.h (+5, -0)
tools/bazel-adb-run.sh (+3, -2)

Diffs (hunks whose only change is whitespace or line re-wrapping are not reproduced):
mace/core/tensor.h

@@ -288,7 +288,8 @@ class Tensor {
       }
       CASES(dtype_, (os << (this->data<T>()[i]) << ", "));
     }
-    LOG(INFO) << os.str();
+    LOG(INFO) << "Tensor size: [" << dim(0) << ", " << dim(1) << ", " << dim(2)
+              << ", " << dim(3) << "], content:\n" << os.str();
   }
 
   inline size_t SizeOfType() const {
mace/kernels/BUILD

@@ -15,15 +15,14 @@ cc_library(
         "*.cc",
         "opencl/*.cc",
     ]) + if_neon_enabled(glob([
-        "neon/*.cc",
+        "neon/addn_neon.cc",
+        "neon/batch_norm_neon.cc",
     ])),
     hdrs = glob([
         "*.h",
         "opencl/*.h",
-    ]) + if_neon_enabled(glob([
-        "neon/*.h",
-    ])),
-    copts = if_openmp_enabled(["-fopenmp"]),
+    ]),
+    copts = if_openmp_enabled(["-fopenmp"]) + if_neon_enabled(["-DMACE_ENABLE_NEON"]),
     linkopts = if_android(["-lm"]),
     deps = [
         "//mace/core",
mace/kernels/addn.h

@@ -5,6 +5,10 @@
 #ifndef MACE_KERNELS_ADDN_H_
 #define MACE_KERNELS_ADDN_H_
 
+#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
+#include <arm_neon.h>
+#endif
+
 #include "mace/core/future.h"
 #include "mace/core/tensor.h"
 #include "mace/core/runtime/opencl/cl2_header.h"
mace/kernels/batch_norm.h

@@ -136,7 +136,7 @@ struct BatchNormFunctor<DeviceType::OPENCL, T> : BatchNormFunctorBase {
Comment-spacing fixes only, on the closing lines "}  // namepsace kernels", "}  // namespace mace" and "#endif  // MACE_KERNELS_BATCH_NORM_H_".
mace/kernels/conv_2d.h (+518, -49): diff collapsed on the original page and not reproduced here.
mace/kernels/conv_pool_2d_util.cc

@@ -73,7 +72,7 @@ void CalcPaddingAndOutputSize(const index_t *input_shape, // NCHW
 }
 
 void CalcNHWCPaddingAndOutputSize(const index_t *input_shape,   // NHWC
-                                  const index_t *filter_shape,  // HWIO
+                                  const index_t *filter_shape,  // HWOI
                                   const int *dilations,
                                   const int *strides,
                                   Padding padding,

@@ -98,7 +97,7 @@ void CalcNHWCPaddingAndOutputSize(const index_t *input_shape, // NHWC
   index_t output_height = 0, output_width = 0;
   index_t kernel_height = filter_shape[0];
   index_t kernel_width = filter_shape[1];
-  index_t output_channels = filter_shape[3];
+  index_t output_channels = filter_shape[2];
   index_t k_extent_height = (kernel_height - 1) * dilations[0] + 1;
   index_t k_extent_width = (kernel_width - 1) * dilations[1] + 1;

@@ -206,18 +203,18 @@ void ConstructInputWithPadding(const Tensor *input_tensor,
   output_tensor->Resize(output_shape);
 
   Tensor::MappingGuard padded_output_mapper(output_tensor);
-  float *output_ptr = output_tensor->mutable_data<float>();
-  memset(output_ptr, 0, output_tensor->size() * sizeof(float));
+  float *output_data = output_tensor->mutable_data<float>();
+  memset(output_data, 0, output_tensor->size() * sizeof(float));
 
   // Skip the padded top rows
   if (padding_same_value) {
-#define COPY_INPUT                                                     \
-  std::fill(output_ptr, output_ptr+padded_left, input[0]);             \
-  output_ptr += padded_left;                                           \
-  memcpy(output_ptr, input, width * sizeof(float));                    \
-  output_ptr += width;                                                 \
-  std::fill(output_ptr , output_ptr + padded_right, input[width-1]);   \
-  output_ptr += padded_right;
+#define COPY_INPUT                                                       \
+  std::fill(output_data, output_data + padded_left, input[0]);           \
+  output_data += padded_left;                                            \
+  memcpy(output_data, input, width * sizeof(float));                     \
+  output_data += width;                                                  \
+  std::fill(output_data, output_data + padded_right, input[width - 1]);  \
+  output_data += padded_right;
 
     const int padded_bottom = paddings[0] - padded_top;
     const int padded_right = paddings[1] - padded_left;

@@ -239,19 +236,69 @@ void ConstructInputWithPadding(const Tensor *input_tensor,
     }
 #undef COPY_INPUT
   } else {
-    output_ptr += padded_top * output_width;
+    output_data += padded_top * output_width;
     for (int i = 0; i < batch; ++i) {
       for (int j = 0; j < channels; ++j) {
         for (int k = 0; k < height; ++k) {
-          memcpy(output_ptr + padded_left, input, width * sizeof(float));
+          memcpy(output_data + padded_left, input, width * sizeof(float));
           input += width;
-          output_ptr += output_width;
+          output_data += output_width;
         }
         // Skip the padded bottom in this channel and top in the next channel
-        output_ptr += paddings[0] * output_width;
+        output_data += paddings[0] * output_width;
       }
     }
   }
 }
 
+void ConstructNHWCInputWithPadding(const Tensor *input_tensor,
+                                   const int *paddings,
+                                   Tensor *output_tensor,
+                                   bool padding_same_value) {
+  VLOG(1) << "input: " << input_tensor->NumElements();
+  Tensor::MappingGuard input_mapper(input_tensor);
+  const float *input = input_tensor->data<float>();
+  const index_t *input_shape = input_tensor->shape().data();
+
+  index_t batch = input_shape[0];
+  index_t height = input_shape[1];
+  index_t width = input_shape[2];
+  index_t channels = input_shape[3];
+
+  std::vector<index_t> output_shape(
+      {batch, paddings[0] + height, paddings[1] + width, channels});
+  const int output_height = output_shape[1];
+  const int output_width = output_shape[2];
+  const int padded_top = paddings[0] / 2;
+  const int padded_left = paddings[1] / 2;
+
+  output_tensor->Resize(output_shape);
+
+  Tensor::MappingGuard padded_output_mapper(output_tensor);
+  float *output_data = output_tensor->mutable_data<float>();
+  memset(output_data, 0, output_tensor->size() * sizeof(float));
+
+  // Skip the padded top rows
+  if (padding_same_value) {
+    LOG(FATAL) << "Not implemented";
+  } else {
+#pragma omp parallel for collapse(3)
+    for (int n = 0; n < batch; ++n) {
+      for (int h = 0; h < height; ++h) {
+        for (int w = 0; w < width; ++w) {
+          const float *input_ptr =
+              input + ((n * height + h) * width + w) * channels;
+          float *output_ptr =
+              output_data + ((n * output_height + h + padded_top) * output_width
+                             + w + padded_left) * channels;
+          memcpy(output_ptr, input_ptr, channels * sizeof(float));
+        }
+      }
+    }
+  }
+}
+
 }  // namespace kernels
 }  // namespace mace
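Both the NCHW and the new NHWC helpers above derive the total padding as max(0, (output - 1) * stride + k_extent - input) and take the top/left half as padding / 2, so any odd leftover pixel lands on the right/bottom, matching the TensorFlow note in the comments. The following standalone sketch of that arithmetic is illustration only (not part of the commit; the sizes are made up):

#include <algorithm>
#include <cstdio>

int main() {
  const int in = 112, kernel = 3, stride = 2, dilation = 1;
  // Dilated kernel extent, as computed in CalcNHWCPaddingAndOutputSize.
  const int k_extent = (kernel - 1) * dilation + 1;
  // SAME-style output size: ceil(in / stride).
  const int out = (in - 1) / stride + 1;
  // Total padding needed so that `out` windows of extent k_extent fit.
  const int padding = std::max(0, (out - 1) * stride + k_extent - in);
  // padded_top/left take padding / 2; the remainder goes to bottom/right.
  std::printf("out=%d padding=%d (top=%d, bottom=%d)\n",
              out, padding, padding / 2, padding - padding / 2);
  return 0;
}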
mace/kernels/conv_pool_2d_util.h

@@ -44,6 +44,12 @@ void ConstructInputWithPadding(const Tensor *input,
                                const int *paddings,
                                Tensor *output_tensor,
                                bool padding_same_value = false);
 
+void ConstructNHWCInputWithPadding(const Tensor *input,
+                                   const int *paddings,
+                                   Tensor *output_tensor,
+                                   bool padding_same_value = false);
+
 }  // namespace kernels
 }  // namespace mace
mace/kernels/depthwise_conv2d.h

@@ -64,8 +64,8 @@ struct DepthwiseConv2dFunctor : public DepthwiseConv2dFunctorBase {
     std::vector<index_t> fake_filter_shape(4);
     fake_filter_shape[0] = filter->shape()[0];
     fake_filter_shape[1] = filter->shape()[1];
-    fake_filter_shape[3] = filter->shape()[2] * filter->shape()[3];
-    fake_filter_shape[2] = 1;
+    fake_filter_shape[2] = filter->shape()[2] * filter->shape()[3];
+    fake_filter_shape[3] = 1;
 
     std::vector<index_t> output_shape(4);
     std::vector<int> paddings(2);
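The index swap above follows the filter-layout change in this commit (HWIO to HWOI): the product of the depthwise channels and multiplier now fills the output-channel slot (index 2) rather than index 3, so the shared padding helper sees a dense HWOI filter. A minimal sketch of that trick, not taken from the repo:

#include <cstdint>
#include <vector>

using index_t = int64_t;

// A depthwise filter with shape [H, W, dim2, dim3] (dim2 * dim3 = total
// output channels) is presented to the padding/output-size helper as a
// dense HWOI filter [H, W, dim2 * dim3, 1].
std::vector<index_t> FakeDenseFilterShape(const std::vector<index_t> &dw) {
  std::vector<index_t> fake(4);
  fake[0] = dw[0];            // H
  fake[1] = dw[1];            // W
  fake[2] = dw[2] * dw[3];    // O
  fake[3] = 1;                // I
  return fake;
}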
mace/kernels/neon/addn_neon.cc

@@ -10,9 +10,11 @@ namespace kernels {
 template <>
 void AddNFunctor<DeviceType::NEON, float>::operator()(
     const std::vector<const Tensor *> &input_tensors,
     Tensor *output_tensor,
     StatsFuture *future) {
   // TODO: neon mem copy
+  output_tensor->ResizeLike(input_tensors[0]);
   index_t size = output_tensor->size();
   float *output_ptr = output_tensor->mutable_data<float>();
   memset(output_ptr, 0, size * sizeof(float));
mace/kernels/neon/avg_pooling_neon_3x3.cc: deleted (100644 → 0, 224 lines). Removed the NEON 3x3, stride-2 average-pooling kernels PoolingAvgNeonK3x3S2x2(input, in_shape, output, out_shape, paddings) and PoolingAvgNeonK3x3S2x2Padded(input, in_shape, output, out_shape); diff not reproduced here.
mace/kernels/neon/batch_norm_neon.cc

@@ -15,6 +15,7 @@ void BatchNormFunctor<DeviceType::NEON, float>::operator()(
     const Tensor *offset,
     const Tensor *mean,
     const Tensor *var,
+    const float epsilon,
     Tensor *output,
     StatsFuture *future) {
   // Batch normalization in the paper https://arxiv.org/abs/1502.03167 .

@@ -26,8 +27,8 @@ void BatchNormFunctor<DeviceType::NEON, float>::operator()(
   // new_offset = \offset - mean * common_val;
   // Y = new_scale * X + new_offset;
   const index_t n = input->dim(0);
-  const index_t channel = input->dim(1);
-  const index_t sample_size = input->dim(2) * input->dim(3);
+  const index_t sample_size = input->dim(1) * input->dim(2);
+  const index_t channel = input->dim(3);
 
   const float *input_ptr = input->data<float>();
   const float *scale_ptr = scale->data<float>();

@@ -36,36 +37,47 @@ void BatchNormFunctor<DeviceType::NEON, float>::operator()(
The previous NCHW body (per-channel new_scale/new_offset splatted with vdupq_n_f32 and applied over each channel's H*W plane, using the epsilon_ member) was replaced with this NHWC implementation:

   const float *var_ptr = var->data<float>();
   float *output_ptr = output->mutable_data<float>();
 
   const index_t ch_blks = channel >> 2;
   const index_t remain_chs = channel - (ch_blks << 2);
 
   std::vector<float> new_scale(channel);
   std::vector<float> new_offset(channel);
 #pragma omp parallel for
   for (index_t c = 0; c < channel; ++c) {
     new_scale[c] = scale_ptr[c] / std::sqrt(var_ptr[c] + epsilon);
     new_offset[c] = offset_ptr[c] - mean_ptr[c] * new_scale[c];
   }
 
 #pragma omp parallel for collapse(2)
   for (index_t i = 0; i < n; ++i) {
     for (index_t j = 0; j < sample_size; ++j) {
       const float *input_sample_ptr = input_ptr + (i * sample_size + j) * channel;
       float *output_sample_ptr = output_ptr + (i * sample_size + j) * channel;
       const float *new_scale_ptr = new_scale.data();
       const float *new_offset_ptr = new_offset.data();
 
       for (index_t cb = 0; cb < ch_blks; ++cb) {
         float32x4_t new_scale_f = vld1q_f32(new_scale_ptr);
         float32x4_t new_offset_f = vld1q_f32(new_offset_ptr);
         float32x4_t input_f = vld1q_f32(input_sample_ptr);
         float32x4_t output_f = vfmaq_f32(new_offset_f, input_f, new_scale_f);
         vst1q_f32(output_sample_ptr, output_f);
 
         input_sample_ptr += 4;
         output_sample_ptr += 4;
         new_scale_ptr += 4;
         new_offset_ptr += 4;
       }
       for (index_t c = (ch_blks << 2); c < channel; ++c) {
         *output_sample_ptr = new_scale[c] * *input_sample_ptr + new_offset[c];
         ++output_sample_ptr;
         ++input_sample_ptr;
         ++new_scale_ptr;
         ++new_offset_ptr;
       }
     }
   }
 };
 
 }  // namespace kernels
 }  // namespace mace
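The comments in this kernel describe folding batch normalization into one multiply-add per element. A scalar reference of the same transform for NHWC data, as illustration only (the NEON code above additionally vectorizes the channel loop with vld1q_f32/vfmaq_f32):

#include <cmath>
#include <vector>

// Folded batch norm, per the comments above:
//   new_scale  = scale / sqrt(var + epsilon)
//   new_offset = offset - mean * new_scale
//   Y          = new_scale * X + new_offset
// Input/output are NHWC with `channels` as the innermost dimension.
void BatchNormNHWC(const std::vector<float> &x, int channels,
                   const std::vector<float> &scale,
                   const std::vector<float> &offset,
                   const std::vector<float> &mean,
                   const std::vector<float> &var, float epsilon,
                   std::vector<float> *y) {
  std::vector<float> new_scale(channels), new_offset(channels);
  for (int c = 0; c < channels; ++c) {
    new_scale[c] = scale[c] / std::sqrt(var[c] + epsilon);
    new_offset[c] = offset[c] - mean[c] * new_scale[c];
  }
  y->resize(x.size());
  for (size_t i = 0; i < x.size(); ++i) {
    const int c = static_cast<int>(i % channels);
    (*y)[i] = new_scale[c] * x[i] + new_offset[c];
  }
}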
mace/kernels/neon/global_avg_pooling_neon.cc: deleted (100644 → 0, 56 lines). Removed the NEON GlobalAvgPoolingFunctor<DeviceType::NEON, float>::operator() implementation; diff not reproduced here.
mace/kernels/neon/max_pooling_neon_2x2.cc: deleted (100644 → 0, 173 lines). Removed the NEON 2x2, stride-2 max-pooling kernels PoolingMaxNeonK2x2S2x2 and PoolingMaxNeonK2x2S2x2Padded; diff not reproduced here.
mace/kernels/neon/max_pooling_neon_3x3.cc: deleted (100644 → 0, 220 lines). Removed the NEON 3x3, stride-2 max-pooling kernels PoolingMaxNeonK3x3S2x2 and PoolingMaxNeonK3x3S2x2Padded; diff not reproduced here.
mace/kernels/neon/pooling_neon.cc: deleted (100644 → 0, 131 lines). Removed PoolingFunctor<DeviceType::NEON, float>::operator(), which dispatched 2x2/s2 and 3x3/s2 max/avg pooling to the NEON kernels above (optionally via the __COPY_MAKE_PADDING pre-padded variants) and fell back to PoolingFunctor<DeviceType::CPU, float> otherwise; diff not reproduced here.
mace/kernels/neon/relu_neon.cc: deleted (100644 → 0, 70 lines). Removed the NEON ActivationFunctor<DeviceType::NEON, float>::operator() (vectorized ReLU with an optional max_limit_ clamp); diff not reproduced here.
mace/kernels/opencl/cl/buffer_to_image.cl

 #include <common.h>
 
-__kernel void filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, ic, oc */
+__kernel void filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, oc, ic */
                                      __private const int filter_w,
-                                     __private const int in_channel,
                                      __private const int out_channel,
+                                     __private const int in_channel,
                                      __write_only image2d_t output) {
   int w = get_global_id(0);
   int h = get_global_id(1);

@@ -13,23 +13,26 @@ __kernel void filter_buffer_to_image(__global const DATA_TYPE *input, ...)
   const int in_channel_idx = w % rounded_in_channel;
   const int h_idx = hw_idx / filter_w;
   const int w_idx = hw_idx % filter_w;
-  const int offset = ((h_idx * filter_w + w_idx) * in_channel + in_channel_idx) * out_channel
-      + out_channel_idx;
+  const int offset = ((h_idx * filter_w + w_idx) * out_channel + out_channel_idx) * in_channel
+      + in_channel_idx;
 
-  const int size = out_channel - out_channel_idx;
   VEC_DATA_TYPE(DATA_TYPE, 4) values = 0;
-  if (in_channel_idx < in_channel) {
+  if (out_channel_idx < out_channel) {
+    const int size = out_channel - out_channel_idx;
     if (size < 4) {
       switch (size) {
         case 3:
-          values.z = *(input + offset + 2);
+          values.z = *(input + offset + 2 * in_channel);
         case 2:
-          values.y = *(input + offset + 1);
+          values.y = *(input + offset + 1 * in_channel);
         case 1:
           values.x = *(input + offset);
       }
     } else {
-      values = vload4(0, input + offset);
+      values.w = *(input + offset + 3 * in_channel);
+      values.z = *(input + offset + 2 * in_channel);
+      values.y = *(input + offset + 1 * in_channel);
+      values.x = *(input + offset);
     }
   }

@@ -37,10 +40,10 @@ __kernel void filter_buffer_to_image(__global const DATA_TYPE *input, ...)
   CMD_TYPE(write_image, CMD_DATA_TYPE)(output, coord, values);
 }
 
-__kernel void filter_image_to_buffer(__global DATA_TYPE *output, /* h, w, ic, oc */
+__kernel void filter_image_to_buffer(__global DATA_TYPE *output, /* h, w, oc, ic */
                                      __private const int filter_w,
-                                     __private const int in_channel,
                                      __private const int out_channel,
+                                     __private const int in_channel,
                                      __read_only image2d_t input) {
   int w = get_global_id(0);
   int h = get_global_id(1);

@@ -50,29 +53,31 @@ __kernel void filter_image_to_buffer(__global DATA_TYPE *output, ...)
   const int in_channel_idx = w % rounded_in_channel;
   const int h_idx = hw_idx / filter_w;
   const int w_idx = hw_idx % filter_w;
-  const int offset = ((h_idx * filter_w + w_idx) * in_channel + in_channel_idx) * out_channel
-      + out_channel_idx;
+  const int offset = ((h_idx * filter_w + w_idx) * out_channel + out_channel_idx) * in_channel
+      + in_channel_idx;
 
-  if (in_channel_idx < in_channel) {
+  if (out_channel_idx < out_channel) {
     int2 coord = (int2)(w, h);
     VEC_DATA_TYPE(DATA_TYPE, 4) values = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, SAMPLER, coord);
     const int size = (out_channel - out_channel_idx);
     if (size < 4) {
       switch (size) {
         case 3:
-          output[offset+2] = values.s2;
+          output[offset + 2 * in_channel] = values.z;
         case 2:
-          output[offset+1] = values.s1;
+          output[offset + 1 * in_channel] = values.y;
         case 1:
-          output[offset] = values.s0;
+          output[offset] = values.x;
       }
     } else {
-      vstore4(values, 0, output + offset);
+      output[offset + 3 * in_channel] = values.w;
+      output[offset + 2 * in_channel] = values.z;
+      output[offset + 1 * in_channel] = values.y;
+      output[offset] = values.x;
     }
   }
 }
 
 __kernel void dw_filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, ic, m */
                                         __private const int filter_w,
                                         __private const int in_channel,
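After this change the filter buffer is addressed as HWOI, and each image texel at (w, h) packs four consecutive output channels for a fixed (h_idx, w_idx, in_channel_idx), which is why the per-lane loads and stores stride by in_channel. A small host-side sketch of the index math, illustration only (the helper name and sizes are made up):

#include <cstdio>

// Offset of element (h_idx, w_idx, oc_idx, ic_idx) in an HWOI filter buffer,
// matching the `offset` expression in filter_buffer_to_image above.
int HwoiOffset(int h_idx, int w_idx, int oc_idx, int ic_idx,
               int filter_w, int out_channel, int in_channel) {
  return ((h_idx * filter_w + w_idx) * out_channel + oc_idx) * in_channel + ic_idx;
}

int main() {
  const int filter_w = 3, out_channel = 8, in_channel = 5;
  const int base = HwoiOffset(1, 2, 4, 3, filter_w, out_channel, in_channel);
  // Lane k of the float4 written to the image reads buffer[base + k * in_channel].
  for (int k = 0; k < 4; ++k)
    std::printf("lane %d -> buffer[%d]\n", k, base + k * in_channel);
  return 0;
}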
mace/kernels/opencl/depthwise_conv_opencl.cc
@@ -149,8 +149,8 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()(
   std::vector<index_t> fake_filter_shape(4);
   fake_filter_shape[0] = filter->shape()[0];
   fake_filter_shape[1] = filter->shape()[1];
-  fake_filter_shape[3] = filter->shape()[2] * filter->shape()[3];
-  fake_filter_shape[2] = 1;
+  fake_filter_shape[2] = filter->shape()[2] * filter->shape()[3];
+  fake_filter_shape[3] = 1;
   std::vector<index_t> output_shape(4);
   std::vector<int> paddings(2);
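The depthwise functor builds a "fake" conv2d filter shape so it can reuse the ordinary padding and output-shape calculation; with the new HWOI filter convention the collapsed channel product now goes into slot 2 (out channels) and slot 3 becomes 1. A minimal sketch of what that amounts to (illustrative names, not MACE API; index_t stands in for MACE's index type):

#include <vector>
typedef long long index_t;  // stand-in for mace::index_t

// A depthwise filter (H, W, In, Multiplier) is treated as a dense conv filter
// (H, W, Out = In * Multiplier, 1) for shape/padding computation.
std::vector<index_t> FakeConvFilterShape(const std::vector<index_t> &dw_filter_shape) {
  std::vector<index_t> fake(4);
  fake[0] = dw_filter_shape[0];                       // filter height
  fake[1] = dw_filter_shape[1];                       // filter width
  fake[2] = dw_filter_shape[2] * dw_filter_shape[3];  // out channels = In * M
  fake[3] = 1;                                        // in channels collapsed
  return fake;
}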
mace/kernels/opencl/helper.cc
@@ -19,12 +19,12 @@ void CalInOutputImageShape(const std::vector<index_t> &shape, /* NHWC */
 }

 // [RoundUp<4>(Ic) * H * W, (Oc + 3) / 4]
-void CalConv2dFilterImageShape(const std::vector<index_t> &shape, /* HWIO */
+void CalConv2dFilterImageShape(const std::vector<index_t> &shape, /* HWOI */
                                std::vector<size_t> &image_shape) {
   MACE_CHECK(shape.size() == 4);
   image_shape.resize(2);
-  image_shape[0] = shape[0] * shape[1] * RoundUp<index_t>(shape[2], 4);
-  image_shape[1] = RoundUpDiv4(shape[3]);
+  image_shape[0] = shape[0] * shape[1] * RoundUp<index_t>(shape[3], 4);
+  image_shape[1] = RoundUpDiv4(shape[2]);
 }

 // [H * W * M, (Ic + 3) / 4]
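The image shape formula itself is unchanged by the HWOI switch: the first image dimension is H * W * RoundUp4(in_channels) and the second is ceil(out_channels / 4); only the slots they are read from move. A quick worked example (standalone C++; RoundUp4/RoundUpDiv4 here are local helpers written for the example, not the MACE ones):

#include <cstdio>

static int RoundUp4(int x) { return (x + 3) / 4 * 4; }
static int RoundUpDiv4(int x) { return (x + 3) / 4; }

int main() {
  // 3x3 filter, Out = 32, In = 3, stored as HWOI = {3, 3, 32, 3}.
  int shape[4] = {3, 3, 32, 3};
  int image_shape_0 = shape[0] * shape[1] * RoundUp4(shape[3]);  // 3 * 3 * 4 = 36
  int image_shape_1 = RoundUpDiv4(shape[2]);                     // (32 + 3) / 4 = 8
  std::printf("filter image shape: [%d, %d]\n", image_shape_0, image_shape_1);  // [36, 8]
  return 0;
}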
@@ -179,6 +179,7 @@ void TuningOrRun3DKernel(cl::Kernel &kernel,
     local_ws[2] = std::min<uint32_t>(gws[2],
                                      kwg_size / (local_ws[0] * local_ws[1]));
     return {
+        // TODO tuning these magic numbers
         {local_ws[0], local_ws[1], local_ws[2], 1},
         {kwg_size / 16, 4, 4, 1},
         {kwg_size / 32, 4, 8, 1},
@@ -200,7 +201,7 @@ void TuningOrRun3DKernel(cl::Kernel &kernel,
         {9, 7, 15, 1},
         {15, 7, 9, 1},
         {1, kwg_size, 1, 1},
-        {4, 15, 8, 1},
+        {4, 15, 8, 1},  // SNPE size
     };
   };
   cl::Event event;
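Before any of those candidate local work-group sizes are tried, local_ws[2] is clamped so the product of the three dimensions never exceeds the kernel's work-group limit. A tiny standalone illustration of that clamp (hypothetical sizes, no OpenCL runtime involved):

#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
  uint32_t gws[3] = {64, 64, 32};   // hypothetical global work sizes
  uint32_t kwg_size = 256;          // hypothetical max work-group size for the kernel
  uint32_t local_ws[3] = {8, 8, 0};
  // Same expression as in TuningOrRun3DKernel above.
  local_ws[2] = std::min<uint32_t>(gws[2], kwg_size / (local_ws[0] * local_ws[1]));
  std::printf("local_ws = {%u, %u, %u}\n", local_ws[0], local_ws[1], local_ws[2]);  // {8, 8, 4}
  return 0;
}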
mace/ops/activation.cc
@@ -13,14 +13,6 @@ void Register_Activation(OperatorRegistry *op_registry) {
                         .Build(),
                     ActivationOp<DeviceType::CPU, float>);

-#if MACE_ENABLE_NEON
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Activation")
-                        .Device(DeviceType::NEON)
-                        .TypeConstraint<float>("T")
-                        .Build(),
-                    ActivationOp<DeviceType::NEON, float>);
-#endif  // MACE_ENABLE_NEON
-
   REGISTER_OPERATOR(op_registry, OpKeyBuilder("Activation")
                         .Device(DeviceType::OPENCL)
                         .TypeConstraint<float>("T")
mace/ops/activation_benchmark.cc
@@ -298,14 +298,15 @@ static void SigmoidBenchmark(
   }                                                                \
   BENCHMARK(BM_SIGMOID_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)

-#define BM_SIGMOID(N, C, H, W, TYPE) \
-  BM_SIGMOID_MACRO(N, C, H, W, TYPE, CPU); \
-  BM_SIGMOID_MACRO(N, C, H, W, TYPE, OPENCL);
+#define BM_SIGMOID(N, C, H, W)                 \
+  BM_SIGMOID_MACRO(N, C, H, W, float, CPU);    \
+  BM_SIGMOID_MACRO(N, C, H, W, float, OPENCL); \
+  BM_SIGMOID_MACRO(N, C, H, W, half, OPENCL);

-BM_SIGMOID(1, 1, 512, 512, float);
-BM_SIGMOID(1, 3, 128, 128, float);
-BM_SIGMOID(1, 3, 512, 512, float);
-BM_SIGMOID(1, 32, 112, 112, float);
-BM_SIGMOID(1, 64, 256, 256, float);
+BM_SIGMOID(1, 1, 512, 512);
+BM_SIGMOID(1, 3, 128, 128);
+BM_SIGMOID(1, 3, 512, 512);
+BM_SIGMOID(1, 32, 112, 112);
+BM_SIGMOID(1, 64, 256, 256);

 }  // namespace mace
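The same macro change repeats across the benchmark files below: the TYPE parameter disappears and each size now fans out into float-CPU, float-OpenCL and half-OpenCL registrations (plus float-NEON where the op still has a NEON kernel). A toy sketch of the pattern with made-up macro names, just to show the expansion:

#include <cstdio>

// Toy stand-in for the per-(type, device) registration macro.
#define MY_BM_MACRO(N, C, H, W, TYPE, DEVICE) \
  std::printf("register %dx%dx%dx%d %s on %s\n", N, C, H, W, #TYPE, #DEVICE)

// One benchmark size now expands to every combination of interest.
#define MY_BM(N, C, H, W)                 \
  MY_BM_MACRO(N, C, H, W, float, CPU);    \
  MY_BM_MACRO(N, C, H, W, float, OPENCL); \
  MY_BM_MACRO(N, C, H, W, half, OPENCL)

int main() {
  MY_BM(1, 3, 128, 128);
  return 0;
}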
mace/ops/activation_test.cc
@@ -53,10 +53,6 @@ void TestSimpleRelu() {
 TEST_F(ActivationOpTest, CPUSimpleRelu) { TestSimpleRelu<DeviceType::CPU>(); }

-#if __ARM_NEON
-TEST_F(ActivationOpTest, NEONSimpleRelu) { TestSimpleRelu<DeviceType::NEON>(); }
-#endif
-
 TEST_F(ActivationOpTest, OPENCLSimpleRelu) {
   TestSimpleRelu<DeviceType::OPENCL>();
 }

@@ -104,12 +100,6 @@ TEST_F(ActivationOpTest, CPUUnalignedSimpleRelu) {
   TestUnalignedSimpleRelu<DeviceType::CPU>();
 }

-#if __ARM_NEON
-TEST_F(ActivationOpTest, NEONUnalignedSimpleRelu) {
-  TestUnalignedSimpleRelu<DeviceType::NEON>();
-}
-#endif
-
 TEST_F(ActivationOpTest, OPENCLUnalignedSimpleRelu) {
   TestUnalignedSimpleRelu<DeviceType::OPENCL>();
 }

@@ -160,10 +150,6 @@ void TestSimpleRelux() {
 TEST_F(ActivationOpTest, CPUSimple) { TestSimpleRelux<DeviceType::CPU>(); }

-#if __ARM_NEON
-TEST_F(ActivationOpTest, NEONSimple) { TestSimpleRelux<DeviceType::NEON>(); }
-#endif
-
 TEST_F(ActivationOpTest, OPENCLSimple) {
   TestSimpleRelux<DeviceType::OPENCL>();
 }

@@ -216,12 +202,6 @@ TEST_F(ActivationOpTest, CPUSimpleRelux) {
   TestSimpleReluRelux<DeviceType::CPU>();
 }

-#if __ARM_NEON
-TEST_F(ActivationOpTest, NEONSimpleRelux) {
-  TestSimpleReluRelux<DeviceType::NEON>();
-}
-#endif
-
 TEST_F(ActivationOpTest, OPENCLSimpleRelux) {
   TestSimpleReluRelux<DeviceType::OPENCL>();
 }

@@ -272,12 +252,6 @@ void TestSimplePrelu() {
 TEST_F(ActivationOpTest, CPUSimplePrelu) { TestSimplePrelu<DeviceType::CPU>(); }

-#if __ARM_NEON
-TEST_F(ActivationOpTest, NEONSimplePrelu) {
-  TestSimplePrelu<DeviceType::NEON>();
-}
-#endif
-
 TEST_F(ActivationOpTest, OPENCLSimplePrelu) {
   TestSimplePrelu<DeviceType::OPENCL>();
 }

@@ -329,10 +303,6 @@ void TestSimpleTanh() {
 TEST_F(ActivationOpTest, CPUSimpleTanh) { TestSimpleTanh<DeviceType::CPU>(); }

-#if __ARM_NEON
-TEST_F(ActivationOpTest, NEONSimpleTanh) { TestSimpleTanh<DeviceType::NEON>(); }
-#endif
-
 TEST_F(ActivationOpTest, OPENCLSimpleTanh) {
   TestSimpleTanh<DeviceType::OPENCL>();
 }

@@ -387,12 +357,6 @@ TEST_F(ActivationOpTest, CPUSimpleSigmoid) {
   TestSimpleSigmoid<DeviceType::CPU>();
 }

-#if __ARM_NEON
-TEST_F(ActivationOpTest, NEONSimpleSigmoid) {
-  TestSimpleSigmoid<DeviceType::NEON>();
-}
-#endif
-
 TEST_F(ActivationOpTest, OPENCLSimpleSigmoid) {
   TestSimpleSigmoid<DeviceType::OPENCL>();
 }
mace/ops/addn_benchmark.cc
@@ -65,16 +65,16 @@ static void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) {
   }                                                                      \
   BENCHMARK(BM_ADDN_##INPUTS##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)

-#define BM_ADDN(INPUTS, N, H, W, C, TYPE) \
-  BM_ADDN_MACRO(INPUTS, N, H, W, C, TYPE, CPU); \
-  BM_ADDN_MACRO(INPUTS, N, H, W, C, TYPE, OPENCL);
+#define BM_ADDN(INPUTS, N, H, W, C)                 \
+  BM_ADDN_MACRO(INPUTS, N, H, W, C, float, CPU);    \
+  BM_ADDN_MACRO(INPUTS, N, H, W, C, float, NEON);   \
+  BM_ADDN_MACRO(INPUTS, N, H, W, C, float, OPENCL); \
+  BM_ADDN_MACRO(INPUTS, N, H, W, C, half, OPENCL);

-BM_ADDN(2, 1, 256, 256, 32, float);
-BM_ADDN(2, 1, 128, 128, 32, float);
-// BM_ADDN(2, 1, 240, 240, 256, half);
-BM_ADDN(4, 1, 128, 128, 3, float);
-BM_ADDN(2, 1, 256, 256, 3, float);
-BM_ADDN(2, 1, 512, 512, 3, float);
-// BM_ADDN(4, 1, 240, 240, 256, half);
+BM_ADDN(2, 1, 256, 256, 32);
+BM_ADDN(2, 1, 128, 128, 32);
+BM_ADDN(4, 1, 128, 128, 3);
+BM_ADDN(2, 1, 256, 256, 3);
+BM_ADDN(2, 1, 512, 512, 3);

 }  // namespace mace
mace/ops/addn_test.cc
@@ -33,12 +33,8 @@ void SimpleAdd2() {
 TEST_F(AddnOpTest, CPUSimpleAdd2) { SimpleAdd2<DeviceType::CPU>(); }

-/*
 TEST_F(AddnOpTest, NEONSimpleAdd2) { SimpleAdd2<DeviceType::NEON>(); }
-
-TEST_F(AddnOpTest, OPENCLSimpleAdd2) { SimpleAdd2<DeviceType::OPENCL>(); }
-*/

 template <DeviceType D>
 void SimpleAdd3() {
   // Construct graph

@@ -65,9 +61,7 @@ void SimpleAdd3() {
 TEST_F(AddnOpTest, CPUSimpleAdd3) { SimpleAdd3<DeviceType::CPU>(); }

-/*
 TEST_F(AddnOpTest, NEONSimpleAdd3) { SimpleAdd3<DeviceType::NEON>(); }
-*/

 template <DeviceType D>
 void RandomTest() {
mace/ops/batch_norm_benchmark.cc
@@ -82,21 +82,24 @@ static void BatchNorm(
   }                                                                   \
   BENCHMARK(BM_BATCH_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)

-#define BM_BATCH_NORM(N, C, H, W, TYPE) \
-  BM_BATCH_NORM_MACRO(N, C, H, W, TYPE, CPU); \
-  BM_BATCH_NORM_MACRO(N, C, H, W, TYPE, OPENCL);
+#define BM_BATCH_NORM(N, C, H, W)                 \
+  BM_BATCH_NORM_MACRO(N, C, H, W, float, CPU);    \
+  BM_BATCH_NORM_MACRO(N, C, H, W, float, NEON);   \
+  BM_BATCH_NORM_MACRO(N, C, H, W, float, OPENCL); \
+  BM_BATCH_NORM_MACRO(N, C, H, W, half, OPENCL);

-BM_BATCH_NORM(1, 1, 512, 512, float);
-BM_BATCH_NORM(1, 3, 128, 128, float);
-BM_BATCH_NORM(1, 3, 512, 512, float);
-BM_BATCH_NORM(1, 32, 112, 112, float);
-BM_BATCH_NORM(1, 64, 256, 256, float);
-BM_BATCH_NORM(1, 64, 512, 512, float);
-BM_BATCH_NORM(1, 128, 56, 56, float);
-BM_BATCH_NORM(1, 128, 256, 256, float);
-BM_BATCH_NORM(1, 256, 14, 14, float);
-BM_BATCH_NORM(1, 512, 14, 14, float);
-BM_BATCH_NORM(1, 1024, 7, 7, float);
-BM_BATCH_NORM(32, 1, 256, 256, float);
-BM_BATCH_NORM(32, 3, 256, 256, float);
+BM_BATCH_NORM(1, 1, 512, 512);
+BM_BATCH_NORM(1, 3, 128, 128);
+BM_BATCH_NORM(1, 3, 512, 512);
+BM_BATCH_NORM(1, 32, 112, 112);
+BM_BATCH_NORM(1, 64, 256, 256);
+BM_BATCH_NORM(1, 64, 512, 512);
+BM_BATCH_NORM(1, 128, 56, 56);
+BM_BATCH_NORM(1, 128, 256, 256);
+BM_BATCH_NORM(1, 256, 14, 14);
+BM_BATCH_NORM(1, 512, 14, 14);
+BM_BATCH_NORM(1, 1024, 7, 7);
+BM_BATCH_NORM(32, 1, 256, 256);
+BM_BATCH_NORM(32, 3, 256, 256);

 }  // namespace mace
mace/ops/batch_norm_test.cc
@@ -72,23 +72,18 @@ void Simple() {
 TEST_F(BatchNormOpTest, SimpleCPU) { Simple<DeviceType::CPU>(); }

-/*
-TEST_F(BatchNormOpTest, SimpleNEON) {
-  Simple<DeviceType::NEON>();
-}
-*/
+TEST_F(BatchNormOpTest, SimpleNEON) { Simple<DeviceType::NEON>(); }

 TEST_F(BatchNormOpTest, SimpleOPENCL) { Simple<DeviceType::OPENCL>(); }

-/*
 TEST_F(BatchNormOpTest, SimpleRandomNeon) {
   srand(time(NULL));

   // generate random input
   index_t batch = 1 + rand() % 10;
-  index_t channels = 3 + rand() % 50;
   index_t height = 64;
   index_t width = 64;
+  index_t channels = 3 + rand() % 50;

   // Construct graph
   OpsTestNet net;
   OpDefBuilder("BatchNorm", "BatchNormTest")

@@ -97,18 +92,17 @@ TEST_F(BatchNormOpTest, SimpleRandomNeon) {
       .Input("Offset")
       .Input("Mean")
       .Input("Var")
-      .Input("Epsilon")
+      .AddFloatArg("epsilon", 1e-3)
       .Output("Output")
       .Finalize(net.NewOperatorDef());

   // Add input data
-  net.AddRandomInput<DeviceType::CPU, float>("Input",
-                                             {batch, channels, height, width});
+  net.AddRandomInput<DeviceType::CPU, float>("Input",
+                                             {batch, height, width, channels});
   net.AddRandomInput<DeviceType::CPU, float>("Scale", {channels});
   net.AddRandomInput<DeviceType::CPU, float>("Offset", {channels});
   net.AddRandomInput<DeviceType::CPU, float>("Mean", {channels});
   net.AddRandomInput<DeviceType::CPU, float>("Var", {channels}, true);
-  net.AddInputFromArray<DeviceType::CPU, float>("Epsilon", {}, {1e-3});

   // run cpu
   net.RunOp();

@@ -139,18 +133,17 @@ TEST_F(BatchNormOpTest, ComplexRandomNeon) {
       .Input("Offset")
       .Input("Mean")
       .Input("Var")
-      .Input("Epsilon")
+      .AddFloatArg("epsilon", 1e-3)
       .Output("Output")
       .Finalize(net.NewOperatorDef());

   // Add input data
-  net.AddRandomInput<DeviceType::CPU, float>("Input",
-                                             {batch, channels, height, width});
+  net.AddRandomInput<DeviceType::CPU, float>("Input",
+                                             {batch, height, width, channels});
   net.AddRandomInput<DeviceType::CPU, float>("Scale", {channels});
   net.AddRandomInput<DeviceType::CPU, float>("Offset", {channels});
   net.AddRandomInput<DeviceType::CPU, float>("Mean", {channels});
   net.AddRandomInput<DeviceType::CPU, float>("Var", {channels}, true);
-  net.AddInputFromArray<DeviceType::CPU, float>("Epsilon", {}, {1e-3});

   // run cpu
   net.RunOp();

@@ -164,7 +157,6 @@ width});
   ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 1e-2);
 }
-*/

 TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
   srand(time(NULL));
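Two things change in these re-enabled tests: epsilon travels as a float operator argument instead of a separate scalar input tensor, and random inputs are generated in NHWC order. As a reference for what the tests are checking, here is a minimal standalone sketch of inference-time batch norm over an NHWC buffer, with epsilon as a plain scalar knob; this is the usual formula, not MACE's kernel code:

#include <cmath>
#include <cstdio>
#include <vector>

// out = scale * (x - mean) / sqrt(var + epsilon) + offset, per channel.
void BatchNormNHWC(const std::vector<float> &in, int batch, int height, int width,
                   int channels, const std::vector<float> &scale,
                   const std::vector<float> &offset, const std::vector<float> &mean,
                   const std::vector<float> &var, float epsilon, std::vector<float> *out) {
  const size_t size = static_cast<size_t>(batch) * height * width * channels;
  out->resize(size);
  for (size_t i = 0; i < size; ++i) {
    int c = static_cast<int>(i % channels);  // NHWC: channel is the innermost dimension
    float inv_std = 1.0f / std::sqrt(var[c] + epsilon);
    (*out)[i] = scale[c] * (in[i] - mean[c]) * inv_std + offset[c];
  }
}

int main() {
  std::vector<float> in = {1.f, 2.f, 3.f, 4.f};  // shape 1x2x2x1
  std::vector<float> out;
  BatchNormNHWC(in, 1, 2, 2, 1, {1.f}, {0.f}, {2.5f}, {1.25f}, 1e-3f, &out);
  for (float v : out) std::printf("%f\n", v);
  return 0;
}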
mace/ops/batch_to_space_benchmark.cc
@@ -47,10 +47,10 @@ static void BMBatchToSpace(
   }                                                                          \
   BENCHMARK(BM_BATCH_TO_SPACE_##N##_##H##_##W##_##C##_##ARG##_##TYPE##_##DEVICE)

-#define BM_BATCH_TO_SPACE(N, H, W, C, ARG, TYPE) \
-  BM_BATCH_TO_SPACE_MACRO(N, H, W, C, ARG, TYPE, OPENCL);
+#define BM_BATCH_TO_SPACE(N, H, W, C, ARG) \
+  BM_BATCH_TO_SPACE_MACRO(N, H, W, C, ARG, float, OPENCL);

-BM_BATCH_TO_SPACE(128, 8, 8, 128, 2, float);
-BM_BATCH_TO_SPACE(4, 128, 128, 32, 2, float);
-BM_BATCH_TO_SPACE(16, 64, 64, 32, 4, float);
+BM_BATCH_TO_SPACE(128, 8, 8, 128, 2);
+BM_BATCH_TO_SPACE(4, 128, 128, 32, 2);
+BM_BATCH_TO_SPACE(16, 64, 64, 32, 4);

 }  // namespace mace
\ No newline at end of file
mace/ops/bias_add.cc
@@ -13,16 +13,6 @@ void Register_BiasAdd(OperatorRegistry *op_registry) {
                         .Build(),
                     BiasAddOp<DeviceType::CPU, float>);

-/*
-#if __ARM_NEON
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("BiasAdd")
-                        .Device(DeviceType::NEON)
-                        .TypeConstraint<float>("T")
-                        .Build(),
-                    BiasAddOp<DeviceType::NEON, float>);
-#endif  // __ARM_NEON
-*/
-
   REGISTER_OPERATOR(op_registry, OpKeyBuilder("BiasAdd")
                         .Device(DeviceType::OPENCL)
                         .TypeConstraint<float>("T")
mace/ops/bias_add_benchmark.cc
@@ -59,21 +59,22 @@ static void BiasAdd(int iters, int batch, int channels, int height, int width) {
   }                                                                 \
   BENCHMARK(BM_BIAS_ADD_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)

-#define BM_BIAS_ADD(N, C, H, W, TYPE) \
-  BM_BIAS_ADD_MACRO(N, C, H, W, TYPE, CPU); \
-  BM_BIAS_ADD_MACRO(N, C, H, W, TYPE, OPENCL);
+#define BM_BIAS_ADD(N, C, H, W)                 \
+  BM_BIAS_ADD_MACRO(N, C, H, W, float, CPU);    \
+  BM_BIAS_ADD_MACRO(N, C, H, W, float, OPENCL); \
+  BM_BIAS_ADD_MACRO(N, C, H, W, half, OPENCL);

-BM_BIAS_ADD(1, 1, 512, 512, float);
-BM_BIAS_ADD(1, 3, 128, 128, float);
-BM_BIAS_ADD(1, 3, 512, 512, float);
-BM_BIAS_ADD(1, 32, 112, 112, float);
-BM_BIAS_ADD(1, 64, 256, 256, float);
-BM_BIAS_ADD(1, 64, 512, 512, float);
-BM_BIAS_ADD(1, 128, 56, 56, float);
-BM_BIAS_ADD(1, 128, 256, 256, float);
-BM_BIAS_ADD(1, 256, 14, 14, float);
-BM_BIAS_ADD(1, 512, 14, 14, float);
-BM_BIAS_ADD(1, 1024, 7, 7, float);
-BM_BIAS_ADD(32, 1, 256, 256, float);
-BM_BIAS_ADD(32, 3, 256, 256, float);
+BM_BIAS_ADD(1, 1, 512, 512);
+BM_BIAS_ADD(1, 3, 128, 128);
+BM_BIAS_ADD(1, 3, 512, 512);
+BM_BIAS_ADD(1, 32, 112, 112);
+BM_BIAS_ADD(1, 64, 256, 256);
+BM_BIAS_ADD(1, 64, 512, 512);
+BM_BIAS_ADD(1, 128, 56, 56);
+BM_BIAS_ADD(1, 128, 256, 256);
+BM_BIAS_ADD(1, 256, 14, 14);
+BM_BIAS_ADD(1, 512, 14, 14);
+BM_BIAS_ADD(1, 1024, 7, 7);
+BM_BIAS_ADD(32, 1, 256, 256);
+BM_BIAS_ADD(32, 3, 256, 256);

 }  // namespace mace
mace/ops/buffer_to_image.cc
@@ -20,4 +20,4 @@ void Register_BufferToImage(OperatorRegistry *op_registry) {
                     BufferToImageOp<DeviceType::OPENCL, half>);
 }

-}  //  namespace mace
+}  // namespace mace
mace/ops/buffer_to_image.h
@@ -35,5 +35,5 @@ class BufferToImageOp: public Operator<D, T> {
   OP_OUTPUT_TAGS(OUTPUT);
 };

-}  //  namespace mace
+}  // namespace mace

-#endif  //  MACE_OPS_BUFFER_TO_IMAGE_H_
+#endif  // MACE_OPS_BUFFER_TO_IMAGE_H_
mace/ops/conv_2d.cc
@@ -13,20 +13,6 @@ void Register_Conv2D(OperatorRegistry *op_registry) {
                         .Build(),
                     Conv2dOp<DeviceType::CPU, float>);

-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Conv2D")
-                        .Device(DeviceType::CPU)
-                        .TypeConstraint<half>("T")
-                        .Build(),
-                    Conv2dOp<DeviceType::CPU, half>);
-
-#if MACE_ENABLE_NEON
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Conv2D")
-                        .Device(DeviceType::NEON)
-                        .TypeConstraint<float>("T")
-                        .Build(),
-                    Conv2dOp<DeviceType::NEON, float>);
-#endif  // MACE_ENABLE_NEON
-
   REGISTER_OPERATOR(op_registry, OpKeyBuilder("Conv2D")
                         .Device(DeviceType::OPENCL)
                         .TypeConstraint<float>("T")
mace/ops/conv_2d_benchmark.cc
@@ -29,7 +29,7 @@ static void Conv2d(int iters,
   // Add input data
   net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
   net.AddRandomInput<D, float>("Filter",
-                               {kernel_h, kernel_w, channels, output_channels});
+                               {kernel_h, kernel_w, output_channels, channels});
   net.AddRandomInput<D, float>("Bias", {output_channels});
   if (D == DeviceType::OPENCL) {

@@ -92,50 +92,46 @@ static void Conv2d(int iters,
   BENCHMARK(                                                                  \
     BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE)

-#define BM_CONV_2D(N, C, H, W, KH, KW, S, P, OC, TYPE) \
-  BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, OPENCL);
+#define BM_CONV_2D(N, C, H, W, KH, KW, S, P, OC)                 \
+  BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, float, CPU);    \
+  BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, float, OPENCL); \
+  BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, half, OPENCL);

-BM_CONV_2D(1, 512, 15, 15, 1, 1, 1, VALID, 1024, half);
-BM_CONV_2D(1, 64, 60, 60, 1, 1, 1, VALID, 128, half);
-BM_CONV_2D(1, 32, 60, 60, 1, 1, 1, VALID, 128, half);
-BM_CONV_2D(1, 128, 60, 60, 3, 3, 1, VALID, 128, half);
-BM_CONV_2D(1, 32, 60, 60, 3, 3, 1, SAME, 32, half);
-BM_CONV_2D(1, 3, 512, 512, 7, 7, 2, SAME, 64, half);
-BM_CONV_2D(1, 512, 64, 64, 1, 1, 1, SAME, 256, half);
-BM_CONV_2D(1, 128, 16, 16, 3, 3, 1, VALID, 32, half);
-BM_CONV_2D(1, 128, 64, 64, 3, 3, 1, VALID, 32, half);
-BM_CONV_2D(1, 128, 128, 128, 3, 3, 1, VALID, 32, half);
-// BM_CONV_2D(1, 3, 2160, 1080, 1, 1, 1, VALID, 3, float);
-// BM_CONV_2D(1, 3, 480, 480, 1, 1, 1, VALID, 3, float);
-//
-// BM_CONV_2D(1, 64, 32, 32, 1, 1, 1, VALID, 128, float);
-// BM_CONV_2D(1, 64, 33, 31, 1, 1, 1, VALID, 128, float); // Test bad
-// alignments
-// BM_CONV_2D(1, 3, 512, 512, 1, 1, 1, VALID, 3, float);
-// BM_CONV_2D(1, 32, 112, 112, 1, 1, 1, VALID, 64, float);
-// BM_CONV_2D(1, 64, 56, 56, 1, 1, 1, VALID, 128, float);
-// BM_CONV_2D(1, 256, 28, 28, 1, 1, 1, VALID, 256, float);
-// BM_CONV_2D(1, 1024, 7, 7, 1, 1, 1, VALID, 1024, float);
-// BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, VALID, 128, float);
-// BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 128, float);
-// BM_CONV_2D(1, 3, 512, 512, 3, 3, 1, VALID, 3, float);
-// BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, SAME, 128, float);
-// BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, SAME, 128, float);
-// BM_CONV_2D(1, 64, 32, 32, 3, 3, 2, VALID, 128, float);
-// BM_CONV_2D(1, 3, 512, 512, 3, 3, 2, VALID, 3, float);
-// BM_CONV_2D(1, 64, 33, 31, 3, 3, 2, VALID, 128, float);
-// BM_CONV_2D(1, 64, 32, 32, 3, 3, 2, SAME, 128, float);
-// BM_CONV_2D(1, 64, 33, 31, 3, 3, 2, SAME, 128, float);
-// BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, VALID, 128, float);
-// BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, VALID, 128, float);
-// BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, SAME, 128, float);
-// BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, SAME, 128, float);
+// ICNet
+BM_CONV_2D(1, 512, 15, 15, 1, 1, 1, VALID, 1024);
+//// SNPE GPU ExecutionDuration = 448us, % ALU Utilization = 105
+BM_CONV_2D(1, 64, 60, 60, 1, 1, 1, VALID, 128);
+//// SNPE GPU ExecutionDuration = 258us, % ALU Utilization = 108
+BM_CONV_2D(1, 32, 60, 60, 1, 1, 1, VALID, 128);
+BM_CONV_2D(1, 128, 60, 60, 3, 3, 1, VALID, 128);
+BM_CONV_2D(1, 32, 60, 60, 3, 3, 1, SAME, 32);
+//// SNPE GPU ExecutionDuration = 506us, % ALU Utilization = 106.8
+BM_CONV_2D(1, 3, 512, 512, 7, 7, 2, SAME, 64);
+BM_CONV_2D(1, 512, 64, 64, 1, 1, 1, SAME, 256);
+BM_CONV_2D(1, 128, 16, 16, 3, 3, 1, VALID, 32);
+BM_CONV_2D(1, 128, 64, 64, 3, 3, 1, VALID, 32);
+BM_CONV_2D(1, 128, 128, 128, 3, 3, 1, VALID, 32);
+BM_CONV_2D(1, 3, 480, 480, 1, 1, 1, VALID, 3);  // Test RGB <-> YUV
+BM_CONV_2D(1, 64, 32, 32, 1, 1, 1, VALID, 128);
+BM_CONV_2D(1, 64, 33, 31, 1, 1, 1, VALID, 128);  // Test bad alignments
+BM_CONV_2D(1, 3, 512, 512, 1, 1, 1, VALID, 3);
+BM_CONV_2D(1, 32, 112, 112, 1, 1, 1, VALID, 64);
+BM_CONV_2D(1, 64, 56, 56, 1, 1, 1, VALID, 128);
+BM_CONV_2D(1, 256, 28, 28, 1, 1, 1, VALID, 256);
+BM_CONV_2D(1, 1024, 7, 7, 1, 1, 1, VALID, 1024);
+BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, VALID, 128);
+BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 128);
+BM_CONV_2D(1, 3, 512, 512, 3, 3, 1, VALID, 3);
+BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, SAME, 128);
+BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, SAME, 128);
+BM_CONV_2D(1, 64, 32, 32, 3, 3, 2, VALID, 128);
+BM_CONV_2D(1, 3, 512, 512, 3, 3, 2, VALID, 3);
+BM_CONV_2D(1, 64, 33, 31, 3, 3, 2, VALID, 128);
+BM_CONV_2D(1, 64, 32, 32, 3, 3, 2, SAME, 128);
+BM_CONV_2D(1, 64, 33, 31, 3, 3, 2, SAME, 128);
+BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, VALID, 128);
+BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, VALID, 128);
+BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, SAME, 128);
+BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, SAME, 128);

 }  // namespace mace
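When reading this list it helps to have a sense of the work each entry implies; a dense convolution costs roughly 2 * N * OH * OW * OC * KH * KW * IC floating-point ops (two per multiply-accumulate). A tiny standalone calculator for that estimate, simplified to VALID padding (my own helper, not the benchmark's internal accounting):

#include <cstdint>
#include <cstdio>

// Approximate FLOP count of a dense 2D convolution (2 ops per MAC).
// Assumes VALID padding: OH = (H - KH) / stride + 1, OW likewise.
int64_t ConvFlops(int n, int c, int h, int w, int kh, int kw, int stride, int oc) {
  int64_t oh = (h - kh) / stride + 1;
  int64_t ow = (w - kw) / stride + 1;
  return 2LL * n * oh * ow * oc * kh * kw * c;
}

int main() {
  // BM_CONV_2D(1, 512, 15, 15, 1, 1, 1, VALID, 1024): about 236 MFLOPs.
  std::printf("%lld\n", static_cast<long long>(ConvFlops(1, 512, 15, 15, 1, 1, 1, 1024)));
  return 0;
}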
mace/ops/conv_2d_test.cc
@@ -10,81 +10,6 @@ using namespace mace;
 class Conv2dOpTest : public OpsTestBase {};

-template <DeviceType D>
-void TestSimple3x3VALID() {
-  OpsTestNet net;
-  OpDefBuilder("Conv2D", "Conv2dTest")
-      .Input("Input")
-      .Input("Filter")
-      .Input("Bias")
-      .Output("Output")
-      .AddIntsArg("strides", {1, 1})
-      .AddIntArg("padding", Padding::VALID)
-      .AddIntsArg("dilations", {1, 1})
-      .Finalize(net.NewOperatorDef());
-
-  // Add args
-  // Add input data
-  net.AddInputFromArray<D, float>(
-      "Input", {1, 2, 3, 3},
-      {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
-  net.AddInputFromArray<D, float>(
-      "Filter", {1, 2, 3, 3},
-      {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
-       1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
-  net.AddInputFromArray<D, float>("Bias", {1}, {0.1f});
-
-  // Run
-  net.RunOp(D);
-
-  // Check
-  auto expected = CreateTensor<float>({1, 1, 1, 1}, {18.1f});
-
-  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
-}
-
-template <DeviceType D>
-void TestSimple3x3SAME() {
-  OpsTestNet net;
-  OpDefBuilder("Conv2D", "Conv2dTest")
-      .Input("Input")
-      .Input("Filter")
-      .Input("Bias")
-      .Output("Output")
-      .AddIntsArg("strides", {1, 1})
-      .AddIntArg("padding", Padding::SAME)
-      .AddIntsArg("dilations", {1, 1})
-      .Finalize(net.NewOperatorDef());
-
-  // Add input data
-  net.AddInputFromArray<D, float>(
-      "Input", {1, 2, 3, 3},
-      {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
-  net.AddInputFromArray<D, float>(
-      "Filter", {1, 2, 3, 3},
-      {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
-       1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
-  net.AddInputFromArray<D, float>("Bias", {1}, {0.1f});
-
-  // Run
-  net.RunOp(D);
-
-  // Check
-  auto expected = CreateTensor<float>(
-      {1, 1, 3, 3},
-      {8.1f, 12.1f, 8.1f, 12.1f, 18.1f, 12.1f, 8.1f, 12.1f, 8.1f});
-
-  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
-}
-
-#if __ARM_NEON
-TEST_F(Conv2dOpTest, NEONSimple) {
-  TestSimple3x3VALID<DeviceType::NEON>();
-  TestSimple3x3SAME<DeviceType::NEON>();
-}
-#endif
-
 template <DeviceType D, typename T>
 void TestNHWCSimple3x3VALID() {
   OpsTestNet net;

@@ -93,7 +18,7 @@ void TestNHWCSimple3x3VALID() {
       "Input", {1, 3, 3, 2},
       {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
   net.AddInputFromArray<D, T>(
-      "Filter", {3, 3, 2, 1},
+      "Filter", {3, 3, 1, 2},
       {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
        1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
   net.AddInputFromArray<D, T>("Bias", {1}, {0.1f});

@@ -150,7 +75,7 @@ void TestNHWCSimple3x3SAME() {
       "Input", {1, 3, 3, 2},
       {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
   net.AddInputFromArray<D, T>(
-      "Filter", {3, 3, 2, 1},
+      "Filter", {3, 3, 1, 2},
       {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
        1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
   net.AddInputFromArray<D, T>("Bias", {1}, {0.1f});

@@ -211,42 +136,6 @@ TEST_F(Conv2dOpTest, OPENCLSimple) {
   TestNHWCSimple3x3SAME<DeviceType::OPENCL, float>();
 }

-template <DeviceType D>
-void TestSimple3x3WithoutBias() {
-  OpsTestNet net;
-  OpDefBuilder("Conv2D", "Conv2dTest")
-      .Input("Input")
-      .Input("Filter")
-      .Output("Output")
-      .AddIntsArg("strides", {1, 1})
-      .AddIntArg("padding", Padding::VALID)
-      .AddIntsArg("dilations", {1, 1})
-      .Finalize(net.NewOperatorDef());
-
-  // Add input data
-  net.AddInputFromArray<D, float>(
-      "Input", {1, 2, 3, 3},
-      {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
-  net.AddInputFromArray<D, float>(
-      "Filter", {1, 2, 3, 3},
-      {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
-       1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
-
-  // Run
-  net.RunOp(D);
-
-  // Check
-  auto expected = CreateTensor<float>({1, 1, 1, 1}, {18.0f});
-
-  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
-}
-
-#ifdef __ARM_NEON
-TEST_F(Conv2dOpTest, NEONWithouBias) {
-  TestSimple3x3WithoutBias<DeviceType::NEON>();
-}
-#endif
-
 template <DeviceType D, typename T>
 void TestNHWCSimple3x3WithoutBias() {
   OpsTestNet net;

@@ -256,7 +145,7 @@ void TestNHWCSimple3x3WithoutBias() {
       "Input", {1, 3, 3, 2},
       {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
   net.AddInputFromArray<D, T>(
-      "Filter", {3, 3, 2, 1},
+      "Filter", {3, 3, 1, 2},
       {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
        1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});

@@ -309,47 +198,6 @@ TEST_F(Conv2dOpTest, OPENCLWithoutBias) {
   TestNHWCSimple3x3WithoutBias<DeviceType::OPENCL, float>();
 }

-template <DeviceType D>
-static void TestCombined3x3() {
-  // Construct graph
-  OpsTestNet net;
-  OpDefBuilder("Conv2D", "Conv2DTest")
-      .Input("Input")
-      .Input("Filter")
-      .Input("Bias")
-      .Output("Output")
-      .AddIntsArg("strides", {2, 2})
-      .AddIntArg("padding", Padding::SAME)
-      .AddIntsArg("dilations", {1, 1})
-      .Finalize(net.NewOperatorDef());
-
-  // Add input data
-  net.AddInputFromArray<D, float>(
-      "Input", {1, 2, 5, 5},
-      {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
-  net.AddInputFromArray<D, float>(
-      "Filter", {2, 2, 3, 3},
-      {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
-       1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f,
-       0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f});
-  net.AddInputFromArray<D, float>("Bias", {2}, {0.1f, 0.2f});
-
-  // Run
-  net.RunOp(D);
-
-  // Check
-  auto expected = CreateTensor<float>(
-      {1, 2, 3, 3},
-      {8.1f, 12.1f, 8.1f, 12.1f, 18.1f, 12.1f, 8.1f, 12.1f, 8.1f,
-       4.2f, 6.2f, 4.2f, 6.2f, 9.2f, 6.2f, 4.2f, 6.2f, 4.2f});
-
-  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
-}
-
-#ifdef __ARM_NEON
-TEST_F(Conv2dOpTest, NEONCombined) { TestCombined3x3<DeviceType::NEON>(); }
-#endif
-
 template <DeviceType D, typename T>
 static void TestNHWCCombined3x3() {
   // Construct graph

@@ -362,9 +210,9 @@ static void TestNHWCCombined3x3() {
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
   net.AddInputFromArray<D, T>(
       "Filter", {3, 3, 2, 2},
-      {1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f,
-       1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f,
-       1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f});
+      {1.0f, 1.0f, 0.5f, 0.5f, 1.0f, 1.0f, 0.5f, 0.5f, 1.0f, 1.0f, 0.5f, 0.5f,
+       1.0f, 1.0f, 0.5f, 0.5f, 1.0f, 1.0f, 0.5f, 0.5f, 1.0f, 1.0f, 0.5f, 0.5f,
+       1.0f, 1.0f, 0.5f, 0.5f, 1.0f, 1.0f, 0.5f, 0.5f, 1.0f, 1.0f, 0.5f, 0.5f});
   net.AddInputFromArray<D, T>("Bias", {2}, {0.1f, 0.2f});
   if (D == DeviceType::OPENCL) {

@@ -436,8 +284,8 @@ void TestConv1x1() {
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
   net.AddInputFromArray<D, float>(
-      "Filter", {1, 1, 5, 2},
-      {1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f});
+      "Filter", {1, 1, 2, 5},
+      {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f});
   net.AddInputFromArray<D, float>("Bias", {2}, {0.1f, 0.2f});
   if (D == DeviceType::OPENCL) {

@@ -522,7 +370,7 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
     // Add input data
     net.AddRandomInput<D, T>("Input", {batch, height, width, input_channels});
     net.AddRandomInput<D, T>(
-        "Filter", {kernel_h, kernel_w, input_channels, output_channels});
+        "Filter", {kernel_h, kernel_w, output_channels, input_channels});
     net.AddRandomInput<D, T>("Bias", {output_channels});

     // run on cpu

@@ -606,7 +454,7 @@ static void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
                                float_input_data);
     std::vector<float> float_filter_data;
     GenerateRandomRealTypeData(
-        {kernel_h, kernel_w, input_channels, output_channels},
+        {kernel_h, kernel_w, output_channels, input_channels},
         float_filter_data);
     std::vector<float> float_bias_data;
     GenerateRandomRealTypeData({output_channels}, float_bias_data);

@@ -614,7 +462,7 @@ static void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
     net.AddInputFromArray<D, float>(
         "Input", {batch, height, width, input_channels}, float_input_data);
     net.AddInputFromArray<D, float>(
-        "Filter", {kernel_h, kernel_w, input_channels, output_channels},
+        "Filter", {kernel_h, kernel_w, output_channels, input_channels},
         float_filter_data);
     net.AddInputFromArray<D, float>("Bias", {output_channels}, float_bias_data);

@@ -748,7 +596,7 @@ static void TestDilationConvNxN(const std::vector<index_t> &shape, const int dil
     // Add input data
     net.AddRandomInput<D, T>("Input", {batch, height, width, input_channels});
     net.AddRandomInput<D, T>(
-        "Filter", {kernel_h, kernel_w, input_channels, output_channels});
+        "Filter", {kernel_h, kernel_w, output_channels, input_channels});
     net.AddRandomInput<D, T>("Bias", {output_channels});

     // run on cpu
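All of these test changes follow one rule: filters handed to Conv2D-style ops are now laid out as {kernel_h, kernel_w, out_channels, in_channels} (HWOI) instead of {kernel_h, kernel_w, in_channels, out_channels}. A quick sketch of what that reordering does to a flat buffer (standalone C++; the helper name is made up for illustration):

#include <cstdio>
#include <vector>

// Convert a flat HWIO filter into HWOI order, the layout the tests now feed in.
std::vector<float> HWIOToHWOI(const std::vector<float> &hwio,
                              int kh, int kw, int ic, int oc) {
  std::vector<float> hwoi(hwio.size());
  for (int h = 0; h < kh; ++h)
    for (int w = 0; w < kw; ++w)
      for (int i = 0; i < ic; ++i)
        for (int o = 0; o < oc; ++o)
          hwoi[((h * kw + w) * oc + o) * ic + i] =
              hwio[((h * kw + w) * ic + i) * oc + o];
  return hwoi;
}

int main() {
  // 1x1 filter, ic = 5, oc = 2: {1,2,1,2,1,2,1,2,1,2} in HWIO
  // becomes {1,1,1,1,1,2,2,2,2,2} in HWOI, matching the TestConv1x1 change above.
  std::vector<float> hwio = {1, 2, 1, 2, 1, 2, 1, 2, 1, 2};
  for (float v : HWIOToHWOI(hwio, 1, 1, 5, 2)) std::printf("%g ", v);
  std::printf("\n");
  return 0;
}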
mace/ops/depthwise_conv2d.cc
@@ -13,14 +13,6 @@ void Register_DepthwiseConv2d(OperatorRegistry *op_registry) {
                         .Build(),
                     DepthwiseConv2dOp<DeviceType::CPU, float>);

-#if MACE_ENABLE_NEON
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthwiseConv2d")
-                        .Device(DeviceType::NEON)
-                        .TypeConstraint<float>("T")
-                        .Build(),
-                    DepthwiseConv2dOp<DeviceType::NEON, float>);
-#endif  // MACE_ENABLE_NEON
-
   REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthwiseConv2d")
                         .Device(DeviceType::OPENCL)
                         .TypeConstraint<float>("T")
mace/ops/depthwise_conv2d_test.cc
@@ -288,12 +288,6 @@ void TestNxNS12(const index_t height, const index_t width) {
   }
 }

-#if __ARM_NEON
-TEST_F(DepthwiseConv2dOpTest, NeonSimpleNxNS12) {
-  TestNxNS12<DeviceType::NEON, float>(4, 4);
-}
-#endif
-
 TEST_F(DepthwiseConv2dOpTest, OpenCLSimpleNxNS12) {
   TestNxNS12<DeviceType::OPENCL, float>(4, 4);
 }

@@ -302,13 +296,6 @@ TEST_F(DepthwiseConv2dOpTest, OpenCLSimpleNxNS12Half) {
   TestNxNS12<DeviceType::OPENCL, half>(4, 4);
 }

-#if __ARM_NEON
-TEST_F(DepthwiseConv2dOpTest, NeonAlignedNxNS12) {
-  TestNxNS12<DeviceType::NEON, float>(64, 64);
-  TestNxNS12<DeviceType::NEON, float>(128, 128);
-}
-#endif
-
 TEST_F(DepthwiseConv2dOpTest, OpenCLAlignedNxNS12) {
   TestNxNS12<DeviceType::OPENCL, float>(64, 64);
   TestNxNS12<DeviceType::OPENCL, float>(128, 128);

@@ -319,12 +306,6 @@ TEST_F(DepthwiseConv2dOpTest, OpenCLAlignedNxNS12Half) {
   TestNxNS12<DeviceType::OPENCL, half>(128, 128);
 }

-#if __ARM_NEON
-TEST_F(DepthwiseConv2dOpTest, NeonUnalignedNxNS12) {
-  TestNxNS12<DeviceType::NEON, float>(107, 113);
-}
-#endif
-
 TEST_F(DepthwiseConv2dOpTest, OpenCLUnalignedNxNS12) {
   TestNxNS12<DeviceType::OPENCL, float>(107, 113);
 }
mace/ops/depthwise_conv_2d_benchmark.cc
@@ -89,21 +89,22 @@ static void DepthwiseConv2d(int iters,
   BENCHMARK(                                                                  \
     BM_DEPTHWISE_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE)

-#define BM_DEPTHWISE_CONV_2D(N, C, H, W, KH, KW, S, P, OC, TYPE) \
-  BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, CPU); \
-  BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, OPENCL);
+#define BM_DEPTHWISE_CONV_2D(N, C, H, W, KH, KW, S, P, OC)                 \
+  BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, float, CPU);    \
+  BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, float, OPENCL); \
+  BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, half, OPENCL);

-BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 1, VALID, 1, float);
-//BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 1, float);
-BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 1, SAME, 1, float);
-//BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 1, SAME, 1, float);
-//BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 1, VALID, 1, float);
-//BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 1, SAME, 1, float);
-//BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 2, VALID, 1, float);
-//BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 2, VALID, 1, float);
-//BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 2, SAME, 1, float);
-//BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 2, SAME, 1, float);
-//BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 2, VALID, 1, float);
-//BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 2, SAME, 1, float);
+BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 1, VALID, 1);
+//BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 1);
+BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 1, SAME, 1);
+//BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 1, SAME, 1);
+//BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 1, VALID, 1);
+//BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 1, SAME, 1);
+//BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 2, VALID, 1);
+//BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 2, VALID, 1);
+//BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 2, SAME, 1);
+//BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 2, SAME, 1);
+//BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 2, VALID, 1);
+//BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 2, SAME, 1);

 }  // namespace mace
mace/ops/folded_batch_norm.cc
@@ -14,14 +14,6 @@ void Register_FoldedBatchNorm(OperatorRegistry *op_registry) {
                         .Build(),
                     FoldedBatchNormOp<DeviceType::CPU, float>);

-#if MACE_ENABLE_NEON
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm")
-                        .Device(DeviceType::NEON)
-                        .TypeConstraint<float>("T")
-                        .Build(),
-                    FoldedBatchNormOp<DeviceType::NEON, float>);
-#endif  // MACE_ENABLE_NEON
-
   REGISTER_OPERATOR(op_registry,
                     OpKeyBuilder("FoldedBatchNorm")
                         .Device(DeviceType::OPENCL)
mace/ops/fused_conv_2d.cc
@@ -13,12 +13,6 @@ void Register_FusedConv2D(OperatorRegistry *op_registry) {
                         .Build(),
                     FusedConv2dOp<DeviceType::CPU, float>);

-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("FusedConv2D")
-                        .Device(DeviceType::CPU)
-                        .TypeConstraint<half>("T")
-                        .Build(),
-                    FusedConv2dOp<DeviceType::CPU, half>);
-
   REGISTER_OPERATOR(op_registry, OpKeyBuilder("FusedConv2D")
                         .Device(DeviceType::OPENCL)
                         .TypeConstraint<float>("T")
mace/ops/fused_conv_2d_test.cc
@@ -298,7 +298,7 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
     // Add input data
     net.AddRandomInput<D, T>("Input", {batch, height, width, input_channels});
     net.AddRandomInput<D, T>(
-        "Filter", {kernel_h, kernel_w, input_channels, output_channels});
+        "Filter", {kernel_h, kernel_w, output_channels, input_channels});
     net.AddRandomInput<D, T>("Bias", {output_channels});

     // run on cpu

@@ -375,7 +375,7 @@ static void TestHalfComplexConvNxNS12(const std::vector<index_t> &shape) {
                                float_input_data);
     std::vector<float> float_filter_data;
     GenerateRandomRealTypeData(
-        {kernel_h, kernel_w, input_channels, output_channels},
+        {kernel_h, kernel_w, output_channels, input_channels},
         float_filter_data);
     std::vector<float> float_bias_data;
     GenerateRandomRealTypeData({output_channels}, float_bias_data);

@@ -383,7 +383,7 @@ static void TestHalfComplexConvNxNS12(const std::vector<index_t> &shape) {
     net.AddInputFromArray<D, float>(
         "Input", {batch, height, width, input_channels}, float_input_data);
     net.AddInputFromArray<D, float>(
-        "Filter", {kernel_h, kernel_w, input_channels, output_channels},
+        "Filter", {kernel_h, kernel_w, output_channels, input_channels},
         float_filter_data);
     net.AddInputFromArray<D, float>("Bias", {output_channels}, float_bias_data);

@@ -462,7 +462,7 @@ static void TestGeneralConvNxNS12(const std::vector<index_t> &image_shape,
     // Add input data
     net.AddRandomInput<D, T>("Input", {batch, height, width, input_channels});
     net.AddRandomInput<D, T>(
-        "Filter", {kernel_h, kernel_w, input_channels, output_channels});
+        "Filter", {kernel_h, kernel_w, output_channels, input_channels});
     net.AddRandomInput<D, T>("Bias", {output_channels});

     // run on cpu

@@ -540,7 +540,7 @@ static void TestAtrousConvNxN(const std::vector<index_t> &shape, const int dilat
     // Add input data
     net.AddRandomInput<D, T>("Input", {batch, height, width, input_channels});
     net.AddRandomInput<D, T>(
-        "Filter", {kernel_h, kernel_w, input_channels, output_channels});
+        "Filter", {kernel_h, kernel_w, output_channels, input_channels});
     net.AddRandomInput<D, T>("Bias", {output_channels});

     // run on cpu

@@ -622,7 +622,7 @@ static void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
     // Add input data
     net.AddRandomInput<D, float>("Input", {batch, height, width, input_channels});
     net.AddRandomInput<D, float>(
-        "Filter", {kernel_h, kernel_w, input_channels, output_channels});
+        "Filter", {kernel_h, kernel_w, output_channels, input_channels});
     net.AddRandomInput<D, float>("Bias", {output_channels});

     // run on cpu
mace/ops/global_avg_pooling.cc
@@ -12,14 +12,6 @@ void Register_GlobalAvgPooling(OperatorRegistry *op_registry) {
                         .TypeConstraint<float>("T")
                         .Build(),
                     GlobalAvgPoolingOp<DeviceType::CPU, float>);

-#if MACE_ENABLE_NEON
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("GlobalAvgPooling")
-                        .Device(DeviceType::NEON)
-                        .TypeConstraint<float>("T")
-                        .Build(),
-                    GlobalAvgPoolingOp<DeviceType::NEON, float>);
-#endif  // MACE_ENABLE_NEON
 }

 }  // namespace mace
mace/ops/global_avg_pooling_test.cc
@@ -31,29 +31,3 @@ TEST_F(GlobalAvgPoolingOpTest, 3x7x7_CPU) {
   ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
 }

-#if __ARM_NEON
-TEST_F(GlobalAvgPoolingOpTest, 3x7x7_NEON) {
-  // Construct graph
-  OpsTestNet net;
-  OpDefBuilder("GlobalAvgPooling", "GlobalAvgPoolingTest")
-      .Input("Input")
-      .Output("Output")
-      .Finalize(net.NewOperatorDef());
-
-  // Add input data
-  std::vector<float> input(147);
-  for (int i = 0; i < 147; ++i) {
-    input[i] = i / 49 + 1;
-  }
-  net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 3, 7, 7}, input);
-
-  // Run
-  net.RunOp(DeviceType::NEON);
-
-  // Check
-  auto expected = CreateTensor<float>({1, 3, 1, 1}, {1, 2, 3});
-
-  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
-}
-#endif
mace/ops/matmul_benchmark.cc
@@ -61,10 +61,10 @@ static void MatMulBenchmark(
   }                                                       \
   BENCHMARK(BM_MATMUL_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE)

-#define BM_MATMUL(N, H, C, W, TYPE) \
-  BM_MATMUL_MACRO(N, H, C, W, TYPE, OPENCL);
+#define BM_MATMUL(N, H, C, W) \
+  BM_MATMUL_MACRO(N, H, C, W, half, OPENCL);

-BM_MATMUL(16, 32, 128, 49, half);
-BM_MATMUL(16, 32, 128, 961, half);
-BM_MATMUL(16, 32, 128, 3969, half);
+BM_MATMUL(16, 32, 128, 49);
+BM_MATMUL(16, 32, 128, 961);
+BM_MATMUL(16, 32, 128, 3969);

 }  // namespace mace
mace/ops/pooling.cc
@@ -18,14 +18,6 @@ void Register_Pooling(OperatorRegistry *op_registry) {
                         .Build(),
                     PoolingOp<DeviceType::CPU, half>);

-#if MACE_ENABLE_NEON
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pooling")
-                        .Device(DeviceType::NEON)
-                        .TypeConstraint<float>("T")
-                        .Build(),
-                    PoolingOp<DeviceType::NEON, float>);
-#endif  // MACE_ENABLE_NEON
-
   REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pooling")
                         .Device(DeviceType::OPENCL)
                         .TypeConstraint<float>("T")
mace/ops/resize_bilinear.cc
@@ -13,14 +13,6 @@ void Register_ResizeBilinear(OperatorRegistry *op_registry) {
                         .Build(),
                     ResizeBilinearOp<DeviceType::CPU, float>);
-
-#if MACE_ENABLE_NEON
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("ResizeBilinear")
-                        .Device(DeviceType::NEON)
-                        .TypeConstraint<float>("T")
-                        .Build(),
-                    ResizeBilinearOp<DeviceType::NEON, float>);
-#endif  // MACE_ENABLE_NEON
   REGISTER_OPERATOR(op_registry, OpKeyBuilder("ResizeBilinear")
                         .Device(DeviceType::OPENCL)
                         .TypeConstraint<float>("T")
   ...
mace/ops/resize_bilinear_benchmark.cc
@@ -69,18 +69,18 @@ static void ResizeBilinearBenchmark(int iters,
   BENCHMARK( \
       BM_RESIZE_BILINEAR_##N##_##C##_##H0##_##W0##_##H1##_##W1##_##TYPE##_##DEVICE)
-#define BM_RESIZE_BILINEAR(N, C, H0, W0, H1, W1, TYPE)       \
-  BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, TYPE, CPU); \
-  BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, TYPE, OPENCL);
+#define BM_RESIZE_BILINEAR(N, C, H0, W0, H1, W1)                 \
+  BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, float, CPU);    \
+  BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, float, OPENCL); \
+  BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, half, OPENCL);
 // SNPE 835 GPU: 6870us
-BM_RESIZE_BILINEAR(1, 128, 120, 120, 480, 480, float);
-BM_RESIZE_BILINEAR(1, 256, 7, 7, 15, 15, float);
-BM_RESIZE_BILINEAR(1, 256, 15, 15, 30, 30, float);
-BM_RESIZE_BILINEAR(1, 128, 30, 30, 60, 60, float);
-BM_RESIZE_BILINEAR(1, 128, 240, 240, 480, 480, float);
-BM_RESIZE_BILINEAR(1, 3, 4032, 3016, 480, 480, float);
-BM_RESIZE_BILINEAR(1, 3, 480, 480, 4032, 3016, float);
+BM_RESIZE_BILINEAR(1, 128, 120, 120, 480, 480);
+BM_RESIZE_BILINEAR(1, 256, 7, 7, 15, 15);
+BM_RESIZE_BILINEAR(1, 256, 15, 15, 30, 30);
+BM_RESIZE_BILINEAR(1, 128, 30, 30, 60, 60);
+BM_RESIZE_BILINEAR(1, 128, 240, 240, 480, 480);
+BM_RESIZE_BILINEAR(1, 3, 4032, 3016, 480, 480);
+BM_RESIZE_BILINEAR(1, 3, 480, 480, 4032, 3016);

 }  // namespace mace
mace/ops/softmax_benchmark.cc
@@ -55,13 +55,14 @@ static void SoftmaxBenchmark(
   } \
   BENCHMARK(BM_SOFTMAX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
-#define BM_SOFTMAX(N, C, H, W, TYPE)       \
-  BM_SOFTMAX_MACRO(N, C, H, W, TYPE, CPU); \
-  BM_SOFTMAX_MACRO(N, C, H, W, TYPE, OPENCL);
+#define BM_SOFTMAX(N, C, H, W)                 \
+  BM_SOFTMAX_MACRO(N, C, H, W, float, CPU);    \
+  BM_SOFTMAX_MACRO(N, C, H, W, float, OPENCL); \
+  BM_SOFTMAX_MACRO(N, C, H, W, half, OPENCL);
-BM_SOFTMAX(1, 1, 512, 512, float);
-BM_SOFTMAX(1, 3, 128, 128, float);
-BM_SOFTMAX(1, 3, 512, 512, float);
-BM_SOFTMAX(1, 32, 112, 112, float);
-BM_SOFTMAX(1, 64, 256, 256, float);
+BM_SOFTMAX(1, 1, 512, 512);
+BM_SOFTMAX(1, 3, 128, 128);
+BM_SOFTMAX(1, 3, 512, 512);
+BM_SOFTMAX(1, 32, 112, 112);
+BM_SOFTMAX(1, 64, 256, 256);

 }  // namespace mace
mace/ops/space_to_batch_benchmark.cc
@@ -49,10 +49,10 @@ static void BMSpaceToBatch(
   BENCHMARK( \
       BM_SPACE_TO_BATCH_##N##_##H##_##W##_##C##_##SHAPE##_##TYPE##_##DEVICE)
-#define BM_SPACE_TO_BATCH(N, H, W, C, SHAPE, TYPE) \
-  BM_SPACE_TO_BATCH_MACRO(N, H, W, C, SHAPE, TYPE, OPENCL);
+#define BM_SPACE_TO_BATCH(N, H, W, C, SHAPE) \
+  BM_SPACE_TO_BATCH_MACRO(N, H, W, C, SHAPE, float, OPENCL);
-BM_SPACE_TO_BATCH(128, 16, 16, 128, 2, float);
-BM_SPACE_TO_BATCH(1, 256, 256, 32, 2, float);
-BM_SPACE_TO_BATCH(1, 256, 256, 32, 4, float);
+BM_SPACE_TO_BATCH(128, 16, 16, 128, 2);
+BM_SPACE_TO_BATCH(1, 256, 256, 32, 2);
+BM_SPACE_TO_BATCH(1, 256, 256, 32, 4);

 }  // namespace mace
\ No newline at end of file
mace/ops/winograd_convolution_test.cc
@@ -19,9 +19,9 @@ void TransposeFilter(const std::vector<float> &input,
   const float *input_ptr = input.data();
   for (index_t h = 0; h < input_shape[0]; ++h) {
     for (index_t w = 0; w < input_shape[1]; ++w) {
-      for (index_t ic = 0; ic < input_shape[2]; ++ic) {
-        for (index_t oc = 0; oc < input_shape[3]; ++oc) {
-          int offset = ((oc * input_shape[2] + ic) * input_shape[0] + h) * input_shape[1] + w;
+      for (index_t oc = 0; oc < input_shape[2]; ++oc) {
+        for (index_t ic = 0; ic < input_shape[3]; ++ic) {
+          int offset = ((oc * input_shape[3] + ic) * input_shape[0] + h) * input_shape[1] + w;
           output[offset] = *input_ptr;
           ++input_ptr;
         }
 ...
@@ -43,7 +43,7 @@ void WinogradConvolution(const index_t batch,
   OpsTestNet net;
   // Add input data
   std::vector<float> filter_data;
-  std::vector<index_t> filter_shape = {3, 3, in_channels, out_channels};
+  std::vector<index_t> filter_shape = {3, 3, out_channels, in_channels};
   GenerateRandomRealTypeData<float>(filter_shape, filter_data);
   net.AddRandomInput<D, float>("Input", {batch, height, width, in_channels});
   net.AddInputFromArray<D, float>("Filter", filter_shape, filter_data);
 ...
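Aside: the TransposeFilter change above switches the expected filter layout from HWIO (height, width, input channels, output channels) to HWOI, while the destination index formula stays in OIHW order. The standalone sketch below, with purely illustrative shapes and names, applies the same index arithmetic to map an HWOI buffer into OIHW order.

// Standalone sketch of the HWOI -> OIHW index mapping used by the updated TransposeFilter.
#include <cstdio>
#include <vector>

int main() {
  const int H = 3, W = 3, OC = 2, IC = 4;  // filter stored as H, W, OC, IC
  std::vector<float> hwoi(H * W * OC * IC);
  for (size_t i = 0; i < hwoi.size(); ++i) hwoi[i] = static_cast<float>(i);

  std::vector<float> oihw(hwoi.size());
  const float *src = hwoi.data();  // walked sequentially in H, W, OC, IC order
  for (int h = 0; h < H; ++h) {
    for (int w = 0; w < W; ++w) {
      for (int oc = 0; oc < OC; ++oc) {
        for (int ic = 0; ic < IC; ++ic) {
          // Same formula as the diff: ((oc * IC + ic) * H + h) * W + w
          int offset = ((oc * IC + ic) * H + h) * W + w;
          oihw[offset] = *src++;
        }
      }
    }
  }
  std::printf("oihw[0] = %f\n", oihw[0]);  // element at oc=0, ic=0, h=0, w=0
  return 0;
}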
mace/ops/winograd_transform_benchmark.cc
@@ -48,12 +48,12 @@ static void BMWinogradTransform(
   BENCHMARK( \
       BM_WINOGRAD_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)
-#define BM_WINOGRAD_TRANSFORM(N, H, W, C, TYPE) \
-  BM_WINOGRAD_TRANSFORM_MACRO(N, H, W, C, TYPE, OPENCL);
+#define BM_WINOGRAD_TRANSFORM(N, H, W, C) \
+  BM_WINOGRAD_TRANSFORM_MACRO(N, H, W, C, half, OPENCL);
-BM_WINOGRAD_TRANSFORM(1, 16, 16, 128, half);
-BM_WINOGRAD_TRANSFORM(1, 64, 64, 128, half);
-BM_WINOGRAD_TRANSFORM(1, 128, 128, 128, half);
+BM_WINOGRAD_TRANSFORM(1, 16, 16, 128);
+BM_WINOGRAD_TRANSFORM(1, 64, 64, 128);
+BM_WINOGRAD_TRANSFORM(1, 128, 128, 128);
 template <DeviceType D, typename T>
 static void BMWinogradInverseTransform(
 ...
@@ -100,11 +100,11 @@ static void BMWinogradInverseTransform(
   BENCHMARK( \
       BM_WINOGRAD_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)
-#define BM_WINOGRAD_INVERSE_TRANSFORM(N, H, W, C, TYPE) \
-  BM_WINOGRAD_INVERSE_TRANSFORM_MACRO(N, H, W, C, TYPE, OPENCL);
+#define BM_WINOGRAD_INVERSE_TRANSFORM(N, H, W, C) \
+  BM_WINOGRAD_INVERSE_TRANSFORM_MACRO(N, H, W, C, half, OPENCL);
-BM_WINOGRAD_INVERSE_TRANSFORM(1, 14, 14, 32, half);
-BM_WINOGRAD_INVERSE_TRANSFORM(1, 62, 62, 32, half);
-BM_WINOGRAD_INVERSE_TRANSFORM(1, 126, 126, 32, half);
+BM_WINOGRAD_INVERSE_TRANSFORM(1, 14, 14, 32);
+BM_WINOGRAD_INVERSE_TRANSFORM(1, 62, 62, 32);
+BM_WINOGRAD_INVERSE_TRANSFORM(1, 126, 126, 32);

 }  // namespace mace
\ No newline at end of file
mace/utils/utils.h
@@ -30,6 +30,11 @@ Integer RoundUpDiv8(Integer i) {
   return (i + 7) >> 3;
 }

+template <typename Integer>
+Integer RoundUpDiv(Integer i, Integer factor) {
+  return (i + factor - 1) / factor;
+}
+
 template <typename Integer>
 Integer CeilQuotient(Integer a, Integer b) {
   return (a + b - 1) / b;
 ...
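Aside: the new RoundUpDiv generalizes RoundUpDiv8 above to an arbitrary divisor (ceiling division). The small program below copies the shown definition into a standalone file purely for demonstration; the example divisors are illustrative, not values taken from MACE.

// Illustrative use of the ceiling division added in mace/utils/utils.h.
#include <cstdio>

template <typename Integer>
Integer RoundUpDiv(Integer i, Integer factor) {
  return (i + factor - 1) / factor;
}

int main() {
  // e.g. how many 4-wide vector lanes cover 10 channels,
  // or how many work-groups of size 64 cover 1000 work-items.
  std::printf("%d\n", RoundUpDiv(10, 4));     // 3
  std::printf("%d\n", RoundUpDiv(1000, 64));  // 16
  return 0;
}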
tools/bazel-adb-run.sh
@@ -18,8 +18,8 @@ BAZEL_BIN_PATH=${BAZEL_BIN_PATH#//}
 BAZEL_BIN_PATH=bazel-bin/$BAZEL_BIN_PATH
 BIN_NAME=`echo $BAZEL_TARGET | cut -d: -f2`
-ANDROID_ABI=arm64-v8a
 ANDROID_ABI=armeabi-v7a
+ANDROID_ABI=arm64-v8a
 STRIP="--strip always"
 VLOG_LEVEL=0
 PROFILING="1"
 ...
@@ -43,7 +43,8 @@ bazel build -c opt $STRIP --verbose_failures $BAZEL_TARGET \
     --copt="-D_GLIBCXX_USE_C99_MATH_TR1" \
     --copt="-DMACE_DISABLE_NO_TUNING_WARNING" \
     --copt="-Werror=return-type" \
-    --define neon=false \
+    --copt="-O3" \
+    --define neon=true \
     --define openmp=true

 if [ $? -ne 0 ]; then
 ...